<<<<<<< HEAD ======= <<<<<<< HEAD >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e <<<<<<< HEAD ======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e Predictive mapping of wholesale grain prices for rural areas in Tanzania <<<<<<< HEAD ======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
<<<<<<< HEAD

Suggested citation: Madaga, Lavinia, Jordan Chamberlin, Bisrat Gebrekidan, Robert Hijmans, Nicholaus Kuboja, and Maxwell Mkondiwa. 2024. “Predictive mapping of wholesale grain prices for rural areas in Tanzania.” https://github.com/EiA2030-ex-ante/Tanzania-Price-Data

======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Introduction

We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, <<<<<<< HEAD using monthly data from across Tanzania over the period May 2021-July ======= using monthly data from across Tanzania over the period May 2021-June >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e 2024.

This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.

Load Libraries

library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
<<<<<<< HEAD
## Warning: package 'ggplot2' was built under R version 4.3.3
======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
<<<<<<< HEAD
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
library(ggplot2)
======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Data

This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.

setwd("H:/Tanzania Price data/Datasets")

prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
<<<<<<< HEAD
## [1] 7912   21
=======
## [1] 7441   21
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
head(prices)
##           Region   Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1:        Arusha   Arusha             43000             45000           140000
## 2: Dar es Salaam   Temeke             46000             47000           120000
## 3:        Dodoma  Majengo             45000             50000           132000
## 4:        Dodoma Kibaigwa             30000             42000               NA
## 5:        Kagera   Bukoba             33000             35000           110000
## 6:       Manyara   Babati             30000             45000           120000
##    Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1:           170000               60000               65000
## 2:           210000               80000              100000
## 3:           200000               50000               60000
## 4:               NA               45000               48000
## 5:           140000              140000              150000
## 6:           170000               60000               80000
##    Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1:                      70000                      75000
## 2:                      80000                     100000
## 3:                      52000                      64000
## 4:                         NA                         NA
## 5:                     120000                     140000
## 6:                      80000                     100000
##    Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1:                    120000                    125000             70000
## 2:                    175000                    180000            110000
## 3:                    200000                    252000                NA
## 4:                        NA                        NA                NA
## 5:                    170000                    180000            170000
## 6:                    120000                    130000            100000
##    Wheat..max.price. Beans..min.price. Beans..max.price.
## 1:             78000            130000            165000
## 2:            120000            220000            260000
## 3:                NA            200000            245000
## 4:                NA                NA                NA
## 5:            180000            110000            170000
## 6:            120000            150000            180000
##    Irish.Potatoes..min.price. Irish.Potatoes..max.price.     Date Latitude
## 1:                      70000                      75000 5/5/2021 -3.36968
## 2:                      75000                      75000 5/5/2021 -6.85097
## 3:                      56000                      68000 5/5/2021 -6.17971
## 4:                         NA                         NA 5/5/2021 -6.08110
## 5:                      60000                      75000 5/5/2021 -1.33140
## 6:                      60000                      60000 5/5/2021 -4.21006
##    Longitude
## 1:  36.68808
## 2:  39.25672
## 3:  35.74109
## 4:  36.64645
## 5:  31.81293
## 6:  35.74915
table(prices$Market)
## 
##       Arusha       Babati      Bariadi     Buguruni       Bukoba        Geita 
<<<<<<< HEAD
##          374          243           78           10          390           38 
##      Igawilo        Ilala       Iringa     Kibaigwa       Kigoma    Kilombero 
##           38          203          392          211          247           38 
##    Kinondoni        Lindi      Majengo      Manyara        Mbeya     Mgandini 
##          203          246          438          116          117           38 
##     Morogoro        Moshi       Mpanda      Mpimbwe       Mtwara       Musoma 
##          386          252          195           39          400          287 
## Mwananyamala       Mwanza       Namfua       Njombe    Nyankumbu        Pwani 
##           38          349           38          174           38           78 
##    Shinyanga      Singida       Songea       Songwe   Sumbawanga       Tabora 
##          282           83          361           48          385          364 
##      Tandale      Tandika        Tanga       Temeke       Ubungo 
##           55           64          361          169           46
======= ## 371 227 64 10 373 38 ## Igawilo Ilala Iringa Kibaigwa Kigoma Kilimanjaro ## 24 187 375 208 227 13 ## Kilombero Kinondoni Lindi Majengo Manyara Mbeya ## 24 202 232 408 115 117 ## Mgandini Mnalani Morogoro Moshi Mpanda Mpimbwe ## 24 9 369 225 193 39 ## Mtwara Musoma Mwananyamala Mwanza Namfua Njombe ## 384 270 24 333 24 172 ## Nyankumbu Pwani Shinyanga Singida Songea Songwe ## 24 55 266 83 344 34 ## Sumbawanga Tabora Tandale Tandika Tanga Temeke ## 368 347 41 50 359 153 ## Ubungo Ujiji ## 32 4 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
sapply(prices, class)
##                     Region                     Market 
##                "character"                "character" 
##          Maize..min.price.          Maize..max.price. 
##                  "integer"                  "integer" 
##           Rice..min.price.           Rice..max.price. 
##                  "integer"                  "integer" 
##        Sorghum..min.price.        Sorghum..max.price. 
##                  "integer"                  "integer" 
## Bulrush.Millet..min.price. Bulrush.Millet..max.price. 
##                  "integer"                  "integer" 
##  Finger.Millet..min.price.  Finger.Millet..max.price. 
##                  "integer"                  "integer" 
##          Wheat..min.price.          Wheat..max.price. 
##                  "integer"                  "integer" 
##          Beans..min.price.          Beans..max.price. 
##                  "integer"                  "integer" 
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. 
##                  "integer"                  "integer" 
##                       Date                   Latitude 
##                "character"                  "numeric" 
##                  Longitude 
##                  "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)

Basic Data preperation

Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded

unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1:    Arusha -3.36968  36.68808
## 2: Kilombero -3.37324  36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
##          Market  Latitude Longitude
## 1:       Temeke -6.850970  39.25672
## 2:    Kinondoni -6.784070  39.27007
## 3:        Ilala -6.829410  39.26289
## 4:      Tandika -6.867912  39.25480
## 5:     Buguruni -6.838380  39.24359
## 6:      Tandale -6.795230  39.24085
## 7:       Ubungo -6.793620  39.20966
## 8: Mwananyamala -6.788890  39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1:  Majengo -6.17971  35.74109
## 2: Kibaigwa -6.08110  36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Bukoba  -1.3314  31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Babati -4.21006  35.74915
## 2: Manyara -4.46011  37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
##        Market Latitude Longitude
## 1: Sumbawanga -7.95239  31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Mtwara -10.28009  40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Tabora -5.01972  32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
##      Market  Latitude Longitude
## 1:    Tanga -5.074260  39.09993
## 2: Mgandini -5.086235  39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Iringa -7.78001  35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
<<<<<<< HEAD
## 1: Kigoma -4.89697  29.65987
======= ## 1: Kigoma -4.89697 29.65987 ## 2: Ujiji -4.90837 29.69203 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1: Morogoro -6.82771  37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Mwanza -2.51969  32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Musoma -1.49982   33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Songea -10.67873  35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1: Shinyanga -3.67226  33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
<<<<<<< HEAD
##    Market Latitude Longitude
## 1:  Moshi -3.34865  37.34352
=======
##         Market Latitude Longitude
## 1:       Moshi -3.34865  37.34352
## 2: Kilimanjaro -3.33854  37.32654
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1:   Mbeya -8.909940  33.45517
## 2: Igawilo -8.924246  33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Mpanda -6.34295  31.07299
## 2: Mpimbwe -7.24425  31.81782
## 3: Majengo -6.34809  31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Njombe -9.33805  34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1:  Lindi   -9.995    39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1: Singida -4.812610   34.7428
## 2:  Namfua -4.821121   34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
<<<<<<< HEAD
##    Market Latitude Longitude
## 1:  Pwani -6.44221  38.90622
=======
##     Market Latitude Longitude
## 1:   Pwani -6.44221  38.90622
## 2: Mnalani -6.44221  38.90622
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1: Bariadi -2.80355  33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
##       Market  Latitude Longitude
## 1:     Geita -2.870760  32.23408
## 2: Nyankumbu -2.895955  32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Songwe -8.95179  33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "integer"     "integer"     "integer" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "integer"     "integer"     "integer"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
#convert prices to numeric 
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)

prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)

sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "numeric"     "numeric"     "numeric" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "numeric"     "numeric"     "numeric"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100

prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day   <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year  <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
                     "ric.price.min", "ric.price.max",
                     "sor.price.min", "sor.price.max",
                     "bul.price.min", "bul.price.max",
                     "fin.price.min", "fin.price.max",
                     "whe.price.min", "whe.price.max", 
                     "bea.price.min", "bea.price.max", 
                     "pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market 
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE), 
           ric.price = mean(ric.price, na.rm = TRUE), 
           sor.price = mean(sor.price, na.rm = TRUE), 
           bul.price = mean(bul.price, na.rm = TRUE),
           fin.price = mean(fin.price, na.rm = TRUE), 
           whe.price = mean(whe.price, na.rm = TRUE), 
           bea.price = mean(bea.price, na.rm = TRUE), 
           pot.price = mean(pot.price, na.rm = TRUE)), 
       by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly 
##             Region     Market Month Year Latitude Longitude mai.price ric.price
##   1:        Arusha     Arusha     5 2021 -3.36968  36.68808  469.0000  1530.000
##   2: Dar es Salaam     Temeke     5 2021 -6.85097  39.25672  485.0000  1650.000
##   3:        Dodoma    Majengo     5 2021 -6.17971  35.74109  501.5000  1592.000
##   4:        Dodoma   Kibaigwa     5 2021 -6.08110  36.64645  375.0000       NaN
##   5:        Kagera     Bukoba     5 2021 -1.33140  31.81293  426.2500  1260.000
##  ---                                                                           
<<<<<<< HEAD
## 860:        Mwanza     Mwanza     7 2024 -2.51969  32.90144  725.0000  2350.000
## 861:         Pwani      Pwani     7 2024 -6.44221  38.90622  835.7143  1946.429
## 862:        Simiyu    Bariadi     7 2024 -2.80355  33.98699  660.0000  1450.000
## 863:        Kigoma     Kigoma     7 2024 -4.89697  29.65987  578.5714  1417.857
## 864:         Rukwa Sumbawanga     7 2024 -7.95239  31.62319  546.7857  1653.571
=======
## 836:        Mwanza     Mwanza     6 2024 -2.51969  32.90144  725.0000  2350.000
## 837:         Pwani    Mnalani     6 2024 -6.44221  38.90622  772.2222  2000.000
## 838:        Simiyu    Bariadi     6 2024 -2.80355  33.98699  563.3333  1316.667
## 839:        Kigoma     Kigoma     6 2024 -4.89697  29.65987  504.2222  1488.889
## 840:         Rukwa Sumbawanga     6 2024 -7.95239  31.62319  533.3333  1750.000
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
##      sor.price bul.price fin.price whe.price bea.price pot.price
##   1:   695.000   761.000  1333.000  793.0000  1545.000  745.0000
##   2:   900.000   900.000  1775.000 1230.0000  2420.000  730.0000
##   3:   558.000   581.000  1752.000       NaN  2075.000  618.0000
##   4:   437.500       NaN       NaN       NaN       NaN       NaN
##   5:  1375.000  1337.500  1637.500 1637.5000  1300.000  675.0000
##  ---                                                            
<<<<<<< HEAD
## 860:  1428.571  1421.429  1621.429       NaN  2528.571 1142.8571
## 861:  1442.857  1550.000  1857.143 1975.0000  2967.857 1735.7143
## 862:  1400.000  1739.286  1728.571 2457.1429  2750.000 1500.0000
## 863:  1821.429  1892.857  1907.143 1867.8571  2228.571 1075.0000
## 864:       NaN       NaN   975.000  719.6429  2135.714  641.0714
======= ## 836: 1400.000 1350.000 1550.000 NaN 2400.000 1000.0000 ## 837: 1522.222 1544.444 2022.222 2400.0000 3116.667 1800.0000 ## 838: 1300.000 1788.889 1716.667 2316.6667 2805.556 1500.0000 ## 839: 1266.667 1266.667 1433.333 2122.2222 2216.667 1127.7778 ## 840: NaN NaN 1088.889 780.5556 1888.889 652.7778 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize    Rice     Sorghum  B.Millet F.Millet Wheat    Beans    Potato  
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize   := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice    := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat   := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans   := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato  := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1  , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2  , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3  , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4  , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5  , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6  , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7  , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8  , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9  , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
##  [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
##  [2] "travel/travel_time_to_cities_u5.tif"        
##  [3] "TRUE/gadm/gadm41_TZA_0_pk.rds"              
##  [4] "TRUE/gadm/gadm41_TZA_1_pk.rds"              
##  [5] "TRUE/gadm/gadm41_TZA_2_pk.rds"              
##  [6] "TRUE/gadm/gadm41_TZA_3_pk.rds"              
##  [7] "TRUE/spam/spam2017v2r1_ssa_area.zip"        
##  [8] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"    
##  [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"    
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"    
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"    
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"    
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"    
## [14] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"    
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"    
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"    
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"    
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"    
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"    
## [20] "TRUE/spam/spam2017v2r1_ssa_yield.zip"       
## [21] "TRUE/travel/travel_time_to_cities_u5.tif"   
## [22] "TRUE/travel/travel_time_to_ports_1.tif"     
## [23] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)

tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)

# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)

Interpolate

## Interpolate

#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predict prices with coodinates only

rf <- randomForest(pkg ~ Longitude + Latitude , data=mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Covariates

The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.

ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm    <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area   <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield  <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd  <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield)  <- c("MAI_YLD") # SPAM maize yield 2010
names(popd)  <- c("popdens") # GPW4

comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"

comment(popd) <- "population density 2020 (GPW4 @ 10dm)"

comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"

Harmonize rasters to national boundaries and common resolution

ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm    <- resample(clm, r)
area   <- resample(area, r)
popd   <- resample(popd, r)
freq(is.na(area))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
area <- classify(area, cbind(NA,0)) 
yield  <- resample(yield, r)
freq(is.na(yield))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
yield <- classify(yield, cbind(NA,0)) 
# check again 
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE

Generate Latitude and Longitude grid

latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")

Prepare Predictor Stack

rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
##  [1] "ttcity_u5" "ttport_1"  "bio_1"     "bio_2"     "bio_3"     "bio_4"    
##  [7] "bio_5"     "bio_6"     "bio_7"     "bio_8"     "bio_9"     "bio_10"   
## [13] "bio_11"    "bio_12"    "bio_13"    "bio_14"    "bio_15"    "bio_16"   
## [19] "bio_17"    "bio_18"    "bio_19"    "MAI_ARE"   "MAI_YLD"   "popdens"  
## [25] "latitude"  "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius  - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 

Lag Rainfall

Bring in Rainfall Data from Chirps

chirps_path <- "H:/Tanzania Price data/chirps_data"

chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)

# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)

#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)

writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)

Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class       : SpatRaster 
<<<<<<< HEAD
## dimensions  : 215, 222, 45  (nrow, ncol, nlyr)
=======
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : Tz_chirps_monthly_croped.tif 
## names       : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ... 
## min values  :  -9999.0000,  -9999.0000,   -9999.000,  -9999.0000,  -9999.0000,  -9999.0000, ... 
## max values  :    459.7112,    504.3329,     508.448,    585.3241,    750.1949,    960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))

#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
##  [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
##  [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
##  [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
<<<<<<< HEAD
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" "chirps-v2.0.2024.07"
======= ## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
##  [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
##  [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
<<<<<<< HEAD
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" "2024-07-01"
======= ## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates

#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates

# Check the SpatRaster object
print(Tz_chirps_monthly)
## class       : SpatRaster 
<<<<<<< HEAD
## dimensions  : 215, 222, 45  (nrow, ncol, nlyr)
=======
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01,  2021-01-01,  2021-02-01,  2021-03-01, 2021-04-01, ... 
## min values  :   5.859746,   1.903993,   0.8161677,   0.3187093,   0.9539621,    1.17571, ... 
## max values  : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463,  960.33765, ... 
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
======= ## time (days) : 2020-11-01 to 2024-06-01 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
                                  expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class       : SpatRaster 
<<<<<<< HEAD
## dimensions  : 215, 222, 45  (nrow, ncol, nlyr)
=======
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.79649,   27.27837,   8.092316,   4.489223,   11.30184,   15.74836, ... 
## max values  :  313.14248,  326.88227, 416.590468, 395.422070,  464.67840,  569.81990, ... 
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
======= ## time (days) : 2020-11-01 to 2024-06-01 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
Rainfall <- Rainfall_focal_sum_100km
#Resample 
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class       : SpatRaster 
<<<<<<< HEAD
## dimensions  : 144, 144, 45  (nrow, ncol, nlyr)
=======
## dimensions  : 144, 144, 44  (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.89972,    32.9472,   8.116516,   4.527868,   11.83634,   15.90436, ... 
## max values  :  310.41034,   326.4339, 416.389160, 395.367279,  462.19266,  550.26270, ... 
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
======= ## time (days) : 2020-11-01 to 2024-06-01 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Define a function to calculate the 6-month lagged sum of rainfall values.

Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.

calculate_lagged_sum <- function(raster_stack, num_months = 6) {
  # Get the time vector from the raster stack
  time_vector <- time(raster_stack)
  
  # Initialize list to store lagged sum rasters
  lagged_sum_rasters <- vector("list", length(time_vector))
  
  # Loop through each layer in the raster stack
  for (i in seq_along(time_vector)) {
    if (i > num_months) {  # We need at least 'num_months' previous layers to calculate the lagged sum
      # Determine the start and end dates for the lag period
      end_date <- time_vector[i] # Date of the current layer being processed
      start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
      
      # Select the layers that fall within the lag period
      lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
      
      # Calculate the sum of the selected layers
      if (nlyr(lag_period_layers) == num_months) {
        lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
      } else {
        lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                        crs = crs(raster_stack), ext = ext(raster_stack), 
                                        vals = NA)  # Use an empty raster with NA values
      }
    } else {
      lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                      crs = crs(raster_stack), ext = ext(raster_stack), 
                                      vals = NA)  # Use an empty raster with NA values
    }
  }
  
  # Combine the lagged sum rasters into a single raster stack, excluding empty rasters
  lagged_sum_stack <- rast(lagged_sum_rasters)
  
  # Set the names for the layers in the lagged sum stack
  names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
  
  return(lagged_sum_stack)
}

# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)

# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class       : SpatRaster 
<<<<<<< HEAD
## dimensions  : 144, 144, 39  (nrow, ncol, nlyr)
=======
## dimensions  : 144, 144, 38  (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ... 
## min values  :   172.7856,   105.5447,   105.4121,   105.1864,   26.20571,   7.070483, ... 
## max values  :  1512.4689,  1355.9669,  1052.9220,  1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")

plot(lagged_rainfall_sum_filtered)

## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class       : SpatRaster 
## dimensions  : 144, 144, 26  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       :    ttcity_u5,  ttport_1,     bio_1,     bio_2,    bio_3,     bio_4, ... 
## min values  : 3.350081e-02,  997.9427,  4.333545,  6.546645, 53.85881,  19.75255, ... 
## max values  : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
names(rstack)
##  [1] "ttcity_u5" "ttport_1"  "bio_1"     "bio_2"     "bio_3"     "bio_4"    
##  [7] "bio_5"     "bio_6"     "bio_7"     "bio_8"     "bio_9"     "bio_10"   
## [13] "bio_11"    "bio_12"    "bio_13"    "bio_14"    "bio_15"    "bio_16"   
## [19] "bio_17"    "bio_18"    "bio_19"    "MAI_ARE"   "MAI_YLD"   "popdens"  
## [25] "latitude"  "longitude"
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
##  [1] "ttcity_u5"               "ttport_1"               
##  [3] "bio_1"                   "bio_2"                  
##  [5] "bio_3"                   "bio_4"                  
##  [7] "bio_5"                   "bio_6"                  
##  [9] "bio_7"                   "bio_8"                  
## [11] "bio_9"                   "bio_10"                 
## [13] "bio_11"                  "bio_12"                 
## [15] "bio_13"                  "bio_14"                 
## [17] "bio_15"                  "bio_16"                 
## [19] "bio_17"                  "bio_18"                 
## [21] "bio_19"                  "MAI_ARE"                
## [23] "MAI_YLD"                 "popdens"                
## [25] "latitude"                "longitude"              
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
<<<<<<< HEAD
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
## [65] "2024-07-01_rain.sum.lag"
======= ## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag" >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
head(mypts)
##          Region   Market Month Year Latitude Longitude  Crop    pkg maize rice
## 1        Arusha   Arusha     5 2021 -3.36968  36.68808 Maize 469.00     1    0
## 2 Dar es Salaam   Temeke     5 2021 -6.85097  39.25672 Maize 485.00     1    0
## 3        Dodoma  Majengo     5 2021 -6.17971  35.74109 Maize 501.50     1    0
## 4        Dodoma Kibaigwa     5 2021 -6.08110  36.64645 Maize 375.00     1    0
## 5        Kagera   Bukoba     5 2021 -1.33140  31.81293 Maize 426.25     1    0
## 6       Manyara   Babati     5 2021 -4.21006  35.74915 Maize 375.00     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 2       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 3       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 4       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 5       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 6       0       0       0     0     0      0   0   0   0   0   1   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3     bio_4
## 1   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.60644
## 3   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4   0   0   0   0  83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5   0   0   0   0  17.1729083 2006.820 20.84789  8.869465 83.92802  38.70414
## 6   0   0   0   0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717  649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068  768.0339
##     bio_13      bio_14    bio_15   bio_16       bio_17   bio_18       bio_19
## 1 237.8160  7.36089814  92.10601 482.4729  24.95659498 276.6105  32.88688141
## 2 264.3072 28.32696315  76.22544 594.2531  90.46537640 268.0798 105.37909361
## 3 133.9309  0.01660535 115.12207 375.4677   0.08605806 284.0681   0.08605806
## 4 129.1152  0.21909479 100.22821 368.5410   2.45863517 320.1588   4.15077049
## 5 346.9308 43.30159629  58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006  0.91639794  89.44063 381.6907   6.74877687 325.6114  12.35086025
##        MAI_ARE      MAI_YLD   popdens latitude longitude
## 1 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672
## 3 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109
## 4 1.489781e+03 3.828877e+02  101.0565 -6.08110  36.64645
## 5 6.446327e+02 1.834932e+03  280.9803 -1.33140  31.81293
## 6 5.496486e+02 1.917829e+03  219.1393 -4.21006  35.74915
##   2021-05-01_rain.sum.lag 2021-06-01_rain.sum.lag 2021-07-01_rain.sum.lag
## 1                541.4019                482.1614                435.6677
## 2                809.3149                777.8958                722.6213
## 3                637.1213                535.1131                365.9182
## 4                656.0751                567.7319                416.4267
## 5               1149.9932                937.6197                853.1074
## 6                576.1356                490.7897                395.8152
##   2021-08-01_rain.sum.lag 2021-09-01_rain.sum.lag 2021-10-01_rain.sum.lag
## 1                386.3060               320.00676               161.93116
## 2                649.9489               604.59631               337.22359
## 3                183.7701                52.54273                15.42844
## 4                277.1030               153.91512                66.15258
## 5                811.6691               763.24715               615.07899
## 6                289.9778               154.11524                68.29101
##   2021-11-01_rain.sum.lag 2021-12-01_rain.sum.lag 2022-01-01_rain.sum.lag
## 1               101.83447               179.80052                232.6533
## 2               216.69636               288.29559                406.3529
## 3                19.66440                70.12616                416.5519
## 4                49.56118               106.72427                413.7123
## 5               488.90052               605.55387                808.6724
## 6                46.47620               108.69646                245.5580
##   2022-02-01_rain.sum.lag 2022-03-01_rain.sum.lag 2022-04-01_rain.sum.lag
## 1                330.6131                383.2848                540.3073
## 2                490.8561                534.5386                763.3206
## 3                817.0083                917.8080               1000.7087
## 4                665.6779                764.1859                927.5299
## 5                899.5895                957.6149               1138.6710
## 6                441.5905                510.5650                641.9617
##   2022-05-01_rain.sum.lag 2022-06-01_rain.sum.lag 2022-07-01_rain.sum.lag
## 1                530.1299                455.6274                403.9131
## 2                816.5829                729.6579                636.5197
## 3                997.3701                947.9434                601.5170
## 4                958.6012                903.3041                595.9210
## 5               1219.1735               1115.3495                905.1190
## 6                643.2443                581.5847                444.9141
##   2022-08-01_rain.sum.lag 2022-09-01_rain.sum.lag 2022-10-01_rain.sum.lag
## 1                305.5547                252.9373               100.53888
## 2                552.9257                495.7899               268.19916
## 3                201.0583                100.2519                14.64072
## 4                344.1166                244.4002                79.55153
## 5                853.5636                777.4608               541.86704
## 6                248.8724                179.9022                48.53265
##   2022-11-01_rain.sum.lag 2022-12-01_rain.sum.lag 2023-01-01_rain.sum.lag
## 1               118.72860                197.4710                209.2468
## 2               249.91483                366.5402                424.9013
## 3                24.33803                210.4590                364.0985
## 4                63.76563                237.8550                386.3680
## 5               469.63839                598.3139                701.7206
## 6                63.04448                191.9892                242.6080
##   2023-02-01_rain.sum.lag 2023-03-01_rain.sum.lag 2023-04-01_rain.sum.lag
## 1                215.1493                279.3411                509.3222
## 2                470.6298                548.0837                839.2660
## 3                459.6726                564.8242                612.5806
## 4                488.0359                587.2738                698.1311
## 5                667.8848                832.3617                977.3868
## 6                281.6344                387.7473                521.9884
##   2023-05-01_rain.sum.lag 2023-06-01_rain.sum.lag 2023-07-01_rain.sum.lag
## 1                506.2685                448.0443                434.9563
## 2                867.6821                794.6204                712.8424
## 3                603.2209                416.3525                262.7130
## 4                703.3618                531.7645                383.2904
## 5                939.2760                855.6117                744.2889
## 6                509.0339                383.3928                332.5788
##   2023-08-01_rain.sum.lag 2023-09-01_rain.sum.lag 2023-10-01_rain.sum.lag
## 1                430.3774               366.50878               151.77184
## 2                671.7647               605.81817               375.59469
## 3                167.1400                61.98845                14.85932
## 4                281.9979               182.81237                74.83833
## 5                701.5641               576.55398               478.95586
## 6                293.7072               187.82502                55.57329
##   2023-11-01_rain.sum.lag 2023-12-01_rain.sum.lag 2024-01-01_rain.sum.lag
## 1               265.66469                299.4620                435.0957
## 2               742.63087                865.3863               1088.0394
## 3                61.64418                263.3634                560.4381
## 4               140.44970                310.9307                640.9717
## 5               677.93283                840.0104                941.7438
## 6               137.00931                245.0625                448.6259
##   2024-02-01_rain.sum.lag 2024-03-01_rain.sum.lag 2024-04-01_rain.sum.lag
## 1                531.3442                668.3936                896.7951
## 2               1108.6897               1277.2697               1582.9947
## 3                754.5153                856.2409                923.1968
## 4                804.6320                935.2507               1026.7195
## 5               1023.2543               1002.7834               1183.9751
## 6                603.8745                714.4857                863.0247
<<<<<<< HEAD
##   2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag 2024-07-01_rain.sum.lag
## 1                821.3999                770.3854                635.6426
## 2               1235.6993               1103.4509                878.0843
## 3                875.1995                673.5048                376.4371
## 4                951.3725                780.1154                449.4857
## 5               1003.6453                792.8721                747.1512
## 6                793.5998                682.0563                478.5240
======= ## 2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag ## 1 821.3999 770.3854 ## 2 1235.6993 1103.4509 ## 3 875.1995 673.5048 ## 4 951.3725 780.1154 ## 5 1003.6453 792.8721 ## 6 793.5998 682.0563 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Here we extract sum of lag rainfall for each row under the column rain.sum.lag

mypts_df <- as.data.frame(mypts)

# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
  # Extract the rainfall value for the current date
  rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
  return(rain_sum)
}

# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
  # Extract relevant data for the current row
  month <- mypts_df$Month[i]
  year <- mypts_df$Year[i]
  current_date <- paste0(year, "-", sprintf("%02d", month), "-01")  # Format date correctly
  # Pass necessary data to the function
  rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
  # Update the rain.sum.lag column
  mypts_df$rain.sum.lag[i] <- rain_sum
}

# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
##  [1] "Region"       "Market"       "Month"        "Year"         "Latitude"    
##  [6] "Longitude"    "Crop"         "pkg"          "maize"        "rice"        
## [11] "sorghum"      "bmillet"      "fmillet"      "wheat"        "beans"       
## [16] "potato"       "jan"          "feb"          "mar"          "apr"         
## [21] "may"          "jun"          "jul"          "aug"          "sep"         
## [26] "oct"          "nov"          "dec"          "ttcity_u5"    "ttport_1"    
## [31] "bio_1"        "bio_2"        "bio_3"        "bio_4"        "bio_5"       
## [36] "bio_6"        "bio_7"        "bio_8"        "bio_9"        "bio_10"      
## [41] "bio_11"       "bio_12"       "bio_13"       "bio_14"       "bio_15"      
## [46] "bio_16"       "bio_17"       "bio_18"       "bio_19"       "MAI_ARE"     
## [51] "MAI_YLD"      "popdens"      "latitude"     "longitude"    "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)

#We'll define month as an interger instead.
# Check to make sure Month is interger
sapply(mypts, class)
##       Region       Market        Month         Year     Latitude    Longitude 
##  "character"  "character"    "integer"    "integer"    "numeric"    "numeric" 
##         Crop          pkg        maize         rice      sorghum      bmillet 
##     "factor"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##      fmillet        wheat        beans       potato          jan          feb 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##          mar          apr          may          jun          jul          aug 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##          sep          oct          nov          dec    ttcity_u5     ttport_1 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##       bio_19      MAI_ARE      MAI_YLD      popdens     latitude    longitude 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
## rain.sum.lag 
##    "numeric"
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize"    "Rice"     "Sorghum"  "B.Millet" "F.Millet" "Wheat"    "Beans"   
## [8] "Potato"

Linear model price Prediction

Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.

# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                 Month +
                 Year + 
                 ttcity_u5 + ttport_1 + 
                 bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                 bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14                  + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
                 MAI_ARE + MAI_YLD + 
                 Longitude + Latitude + 
                 rain.sum.lag,
               data = mypts)

summary(lm_model)
## 
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + 
##     wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + 
##     bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + 
##     bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + 
##     bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + 
##     Latitude + rain.sum.lag, data = mypts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
<<<<<<< HEAD
## -1459.51  -243.70   -13.83   238.66  1620.17 
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.233e+05  1.191e+04 -35.529  < 2e-16 ***
## maize        -6.523e+01  1.844e+01  -3.537 0.000408 ***
## rice          1.381e+03  1.844e+01  74.904  < 2e-16 ***
## sorghum       3.849e+02  1.950e+01  19.743  < 2e-16 ***
## bmillet       4.723e+02  2.091e+01  22.580  < 2e-16 ***
## fmillet       7.840e+02  1.925e+01  40.735  < 2e-16 ***
## wheat         9.450e+02  2.114e+01  44.707  < 2e-16 ***
## beans         1.518e+03  1.846e+01  82.214  < 2e-16 ***
## potato               NA         NA      NA       NA    
## Month         4.276e+00  1.898e+00   2.253 0.024292 *  
## Year          2.013e+02  5.651e+00  35.622  < 2e-16 ***
## ttcity_u5     1.860e+00  1.595e-01  11.661  < 2e-16 ***
## ttport_1     -1.884e+00  1.697e-01 -11.103  < 2e-16 ***
## bio_1        -2.381e+03  1.913e+02 -12.445  < 2e-16 ***
## bio_2        -2.564e+03  2.390e+02 -10.726  < 2e-16 ***
## bio_3         2.211e+02  1.924e+01  11.493  < 2e-16 ***
## bio_4         2.042e+01  4.048e+00   5.046 4.65e-07 ***
## bio_5         1.899e+03  1.843e+02  10.305  < 2e-16 ***
## bio_6        -1.860e+03  1.952e+02  -9.527  < 2e-16 ***
## bio_7                NA         NA      NA       NA    
## bio_8         8.773e+02  9.767e+01   8.982  < 2e-16 ***
## bio_9        -4.409e+01  8.071e+01  -0.546 0.584883    
## bio_10       -1.086e+03  2.162e+02  -5.024 5.21e-07 ***
## bio_11        2.628e+03  3.105e+02   8.465  < 2e-16 ***
## bio_12        1.087e+00  2.744e-01   3.963 7.50e-05 ***
## bio_13        2.895e-01  9.237e-01   0.313 0.753959    
## bio_14        5.356e+01  1.231e+01   4.352 1.37e-05 ***
## bio_15        2.023e+01  2.103e+00   9.620  < 2e-16 ***
## bio_16       -1.176e+00  7.576e-01  -1.553 0.120493    
## bio_17       -1.533e+01  3.553e+00  -4.315 1.62e-05 ***
## bio_18        1.488e-01  2.500e-01   0.595 0.551679    
## bio_19        3.072e+00  2.777e+00   1.106 0.268608    
## MAI_ARE      -1.173e-02  3.274e-02  -0.358 0.720176    
## MAI_YLD       1.315e-01  1.745e-02   7.537 5.54e-14 ***
## Longitude     7.918e+01  3.568e+01   2.219 0.026521 *  
## Latitude      9.897e-01  3.509e+01   0.028 0.977500    
## rain.sum.lag -2.107e-01  1.945e-02 -10.835  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 379 on 5898 degrees of freedom
## Multiple R-squared:  0.7341, Adjusted R-squared:  0.7325 
## F-statistic: 478.9 on 34 and 5898 DF,  p-value: < 2.2e-16
======= ## -1491.81 -239.93 -10.63 235.20 1602.93 ## ## Coefficients: (2 not defined because of singularities) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -4.592e+05 1.221e+04 -37.593 < 2e-16 *** ## maize -5.132e+01 1.850e+01 -2.774 0.005549 ** ## rice 1.397e+03 1.850e+01 75.482 < 2e-16 *** ## sorghum 3.847e+02 1.957e+01 19.660 < 2e-16 *** ## bmillet 4.745e+02 2.103e+01 22.556 < 2e-16 *** ## fmillet 7.905e+02 1.933e+01 40.885 < 2e-16 *** ## wheat 9.527e+02 2.126e+01 44.821 < 2e-16 *** ## beans 1.516e+03 1.852e+01 81.852 < 2e-16 *** ## potato NA NA NA NA ## Month 6.676e+00 1.886e+00 3.540 0.000403 *** ## Year 2.186e+02 5.854e+00 37.343 < 2e-16 *** ## ttcity_u5 1.881e+00 1.588e-01 11.850 < 2e-16 *** ## ttport_1 -1.918e+00 1.685e-01 -11.388 < 2e-16 *** ## bio_1 -2.372e+03 1.891e+02 -12.542 < 2e-16 *** ## bio_2 -2.675e+03 2.351e+02 -11.377 < 2e-16 *** ## bio_3 2.262e+02 1.925e+01 11.747 < 2e-16 *** ## bio_4 2.172e+01 4.038e+00 5.380 7.73e-08 *** ## bio_5 1.992e+03 1.793e+02 11.107 < 2e-16 *** ## bio_6 -1.950e+03 1.923e+02 -10.139 < 2e-16 *** ## bio_7 NA NA NA NA ## bio_8 9.181e+02 9.258e+01 9.917 < 2e-16 *** ## bio_9 -6.182e+01 7.954e+01 -0.777 0.437097 ## bio_10 -1.244e+03 2.052e+02 -6.061 1.43e-09 *** ## bio_11 2.754e+03 3.005e+02 9.165 < 2e-16 *** ## bio_12 1.155e+00 2.783e-01 4.150 3.38e-05 *** ## bio_13 -7.293e-02 9.213e-01 -0.079 0.936911 ## bio_14 5.699e+01 1.259e+01 4.526 6.14e-06 *** ## bio_15 2.089e+01 2.133e+00 9.796 < 2e-16 *** ## bio_16 -1.014e+00 7.661e-01 -1.323 0.185851 ## bio_17 -1.791e+01 3.588e+00 -4.991 6.19e-07 *** ## bio_18 1.459e-01 2.512e-01 0.581 0.561329 ## bio_19 4.667e+00 2.700e+00 1.729 0.083933 . ## MAI_ARE -2.628e-02 3.250e-02 -0.808 0.418874 ## MAI_YLD 1.432e-01 1.704e-02 8.406 < 2e-16 *** ## Longitude 9.447e+01 3.530e+01 2.676 0.007466 ** ## Latitude 1.300e+01 3.535e+01 0.368 0.713134 ## rain.sum.lag -2.220e-01 1.938e-02 -11.453 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 374.6 on 5718 degrees of freedom ## Multiple R-squared: 0.7403, Adjusted R-squared: 0.7388 ## F-statistic: 479.5 on 34 and 5718 DF, p-value: < 2.2e-16 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

RandomForest and TPS

Random Forest price prediction

First check if there are any predictors with NA values

for(column in seq_along(mypts)){
  if(any(is.na(mypts[column]))){
    print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
  }
}

#There are no columns with missing values

Split data the to be used for Training and validation

Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.

# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
<<<<<<< HEAD
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)
======= head(training_data)
##          Region   Market Month Year Latitude Longitude  Crop    pkg maize rice
## 1        Arusha   Arusha     5 2021 -3.36968  36.68808 Maize 469.00     1    0
## 2 Dar es Salaam   Temeke     5 2021 -6.85097  39.25672 Maize 485.00     1    0
## 3        Dodoma  Majengo     5 2021 -6.17971  35.74109 Maize 501.50     1    0
## 4        Dodoma Kibaigwa     5 2021 -6.08110  36.64645 Maize 375.00     1    0
## 5        Kagera   Bukoba     5 2021 -1.33140  31.81293 Maize 426.25     1    0
## 6       Manyara   Babati     5 2021 -4.21006  35.74915 Maize 375.00     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 2       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 3       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 4       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 5       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 6       0       0       0     0     0      0   0   0   0   0   1   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3     bio_4
## 1   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.60644
## 3   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4   0   0   0   0  83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5   0   0   0   0  17.1729083 2006.820 20.84789  8.869465 83.92802  38.70414
## 6   0   0   0   0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717  649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068  768.0339
##     bio_13      bio_14    bio_15   bio_16       bio_17   bio_18       bio_19
## 1 237.8160  7.36089814  92.10601 482.4729  24.95659498 276.6105  32.88688141
## 2 264.3072 28.32696315  76.22544 594.2531  90.46537640 268.0798 105.37909361
## 3 133.9309  0.01660535 115.12207 375.4677   0.08605806 284.0681   0.08605806
## 4 129.1152  0.21909479 100.22821 368.5410   2.45863517 320.1588   4.15077049
## 5 346.9308 43.30159629  58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006  0.91639794  89.44063 381.6907   6.74877687 325.6114  12.35086025
##        MAI_ARE      MAI_YLD   popdens latitude longitude rain.sum.lag
## 1 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808     541.4019
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672     809.3149
## 3 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109     637.1213
## 4 1.489781e+03 3.828877e+02  101.0565 -6.08110  36.64645     656.0751
## 5 6.446327e+02 1.834932e+03  280.9803 -1.33140  31.81293    1149.9932
## 6 5.496486e+02 1.917829e+03  219.1393 -4.21006  35.74915     576.1356
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
head(validation_data)
##          Region  Market Month Year Latitude Longitude  Crop      pkg maize rice
## 1 Dar es Salaam   Ilala     1 2024 -6.82941  39.26289 Maize 775.0000     1    0
## 2 Dar es Salaam  Temeke     1 2024 -6.85097  39.25672 Maize 850.0000     1    0
## 3   Kilimanjaro   Moshi     1 2024 -3.34865  37.34352 Maize 850.0000     1    0
## 4       Singida Singida     1 2024 -4.81261  34.74280 Maize 700.0000     1    0
## 5        Arusha  Arusha     1 2024 -3.36968  36.68808 Maize 709.0909     1    0
## 6        Dodoma Majengo     1 2024 -6.17971  35.74109 Maize 690.4545     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 2       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 3       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 4       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 5       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 6       0       0       0     0     0      0   1   0   0   0   0   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3    bio_4
## 1   0   0   0   0   0.4051143 1689.367 25.92678  8.978264 67.42460 152.9108
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.6064
## 3   0   0   0   0  11.3442758 1450.131 22.29721 11.005590 67.75448 180.7284
## 4   0   0   0   0 165.2321068 1606.655 20.38367 11.683626 71.57368 137.7943
## 5   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.4943
## 6   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.4637
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 32.19589 18.88015 13.31574 26.54073 24.05313 27.71579 24.02143 1122.8900
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 31.30237 15.06427 16.23809 22.88764 20.26380 24.25183 19.85315  938.0520
## 4 28.10475 11.78088 16.32386 21.29394 18.36286 21.69755 18.36286  662.1342
## 5 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 6 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
##     bio_13      bio_14    bio_15   bio_16      bio_17   bio_18       bio_19
## 1 258.0938 28.00268809  75.59542 582.2198 89.31864258 266.4441 103.38645312
## 2 264.3072 28.32696315  76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 277.1083 14.38640622  97.50828 541.7736 53.36366989 211.4703  72.58868003
## 4 140.9090  0.00000000 104.78967 387.3265  0.31403068 192.0734   0.31403068
## 5 237.8160  7.36089814  92.10601 482.4729 24.95659498 276.6105  32.88688141
## 6 133.9309  0.01660535 115.12207 375.4677  0.08605806 284.0681   0.08605806
##        MAI_ARE      MAI_YLD   popdens latitude longitude rain.sum.lag
## 1 3.490943e-03 1.426787e-02 8044.9216 -6.82941  39.26289    1085.9139
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672    1088.0394
## 3 1.654091e+03 3.030923e+03  522.6697 -3.34865  37.34352     508.5944
## 4 1.631720e+01 1.052680e+03  254.1591 -4.81261  34.74280     613.3819
## 5 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808     435.0957
## 6 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109     560.4381
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Random Forest for generating variable of importance

Tune The Forest

The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.

# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)

trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
              y=mypts_df$pkg) # Response variable
<<<<<<< HEAD
## mtry = 18  OOB error = 1350.818 
## Searching left ...
## mtry = 9     OOB error = 7006.815 
## -4.187091 0.05 
## Searching right ...
## mtry = 36    OOB error = 155.8048 
## 0.8846589 0.05 
## mtry = 55    OOB error = 91.25705 
## 0.414286 0.05

=======
## mtry = 18  OOB error = 1420.461 
## Searching left ...
## mtry = 9     OOB error = 8080.315 
## -4.688516 0.05 
## Searching right ...
## mtry = 36    OOB error = 136.5892 
## 0.9038417 0.05 
## mtry = 55    OOB error = 107.578 
## 0.2123972 0.05

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.

(mintree <- trf[which.min(trf[,2]),1])
## [1] 55

Fit The Random Forest Model (1)

Random Forest for generating variable of importance

Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.

# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                      Month + 
                      Year +
                      ttcity_u5 + ttport_1 + 
                      bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 +                        bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + 
                      MAI_ARE + MAI_YLD + 
                      Longitude + Latitude + 
                      rain.sum.lag,
                    data = training_data,mtry=mintree,importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttcity_u5 +      ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 +      bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE +      MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data,      mtry = mintree, importance = TRUE, na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 36
## 
<<<<<<< HEAD
##           Mean of squared residuals: 23992.53
##                     % Var explained: 95.4
varImpPlot(rf1)

## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 154.7101
======= ## Mean of squared residuals: 24098.28 ## % Var explained: 95.38
varImpPlot(rf1)

## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 155.1699
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.

importance_metrics <- importance(rf1, type=1)  # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
impvar
##  [1] "Year"         "Month"        "beans"        "rice"         "Longitude"   
<<<<<<< HEAD
##  [6] "rain.sum.lag" "fmillet"      "wheat"        "sorghum"      "bmillet"     
## [11] "bio_12"       "bio_18"       "MAI_YLD"      "bio_15"       "MAI_ARE"     
## [16] "bio_3"        "ttcity_u5"    "Latitude"     "ttport_1"     "bio_2"       
## [21] "bio_8"        "bio_4"        "bio_1"        "bio_14"       "bio_5"       
## [26] "bio_7"        "bio_13"       "bio_11"       "bio_16"       "bio_10"      
## [31] "bio_6"        "bio_19"       "maize"        "bio_9"        "potato"      
=======
##  [6] "sorghum"      "rain.sum.lag" "wheat"        "fmillet"      "bmillet"     
## [11] "bio_12"       "bio_18"       "bio_15"       "MAI_YLD"      "ttcity_u5"   
## [16] "Latitude"     "MAI_ARE"      "bio_3"        "ttport_1"     "bio_4"       
## [21] "bio_1"        "bio_8"        "bio_2"        "bio_7"        "bio_13"      
## [26] "bio_5"        "bio_16"       "bio_19"       "bio_14"       "bio_6"       
## [31] "bio_10"       "maize"        "bio_11"       "potato"       "bio_9"       
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## [36] "bio_17"
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
##  [1] "Year"         "Month"        "beans"        "rice"         "Longitude"   
<<<<<<< HEAD
##  [6] "rain.sum.lag" "fmillet"      "wheat"        "sorghum"      "bmillet"     
## [11] "bio_12"       "bio_18"       "MAI_YLD"      "bio_15"       "MAI_ARE"     
## [16] "bio_3"        "ttcity_u5"    "Latitude"     "ttport_1"     "bio_2"
======= ## [6] "sorghum" "rain.sum.lag" "wheat" "fmillet" "bmillet" ## [11] "bio_12" "bio_18" "bio_15" "MAI_YLD" "ttcity_u5" ## [16] "Latitude" "MAI_ARE" "bio_3" "ttport_1" "bio_4" >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
node_purity <- importance(rf1, type=2)  # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
node_purity_sorted
##         rice        beans         Year        wheat      fmillet        Month 
<<<<<<< HEAD
##    537127817    466842514    380388805    131058156    127560634    114732842 
##       potato       bio_12        maize    Longitude rain.sum.lag      sorghum 
##     49729419     45951002     43414090     43009140     40344444     39020394 
##      bmillet        bio_3      MAI_YLD        bio_7       bio_18    ttcity_u5 
##     38185514     27679081     24717292     23928361     17424075     15991756 
##     Latitude      MAI_ARE       bio_15     ttport_1        bio_6        bio_2 
##     14238819     14126778     13630288     12292185      9976545      8630842 
##        bio_5        bio_4        bio_9       bio_11        bio_8       bio_10 
##      7958109      7043691      6035166      5531405      5380741      5294050 
##       bio_19        bio_1       bio_13       bio_16       bio_14       bio_17 
##      5012073      4953527      4828056      4591865      4321322      2834622
======= ## 542294846 475761091 378186164 131333684 130770477 119720507 ## potato bio_12 maize rain.sum.lag Longitude sorghum ## 48284677 48043427 42324325 40567947 39033720 38997850 ## bmillet bio_3 bio_7 MAI_YLD ttcity_u5 bio_18 ## 37845171 28106933 23782771 22171819 20935751 17336321 ## Latitude MAI_ARE bio_15 ttport_1 bio_6 bio_2 ## 13366783 13338376 13288862 12177421 10042208 9327438 ## bio_5 bio_4 bio_9 bio_10 bio_19 bio_8 ## 8280274 6942595 6417617 6178000 5506653 5236076 ## bio_13 bio_1 bio_11 bio_16 bio_14 bio_17 ## 5227136 5169082 4743789 4696699 4618320 3137373 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
##  [1] "rice"         "beans"        "Year"         "wheat"        "fmillet"     
<<<<<<< HEAD
##  [6] "Month"        "potato"       "bio_12"       "maize"        "Longitude"   
## [11] "rain.sum.lag" "sorghum"      "bmillet"      "bio_3"        "MAI_YLD"     
## [16] "bio_7"        "bio_18"       "ttcity_u5"    "Latitude"     "MAI_ARE"
rf1$importanceSD
##        maize         rice      sorghum      bmillet      fmillet        wheat 
##   1983.76869   2901.76550    571.18768    667.06925   1675.09637   1611.60029 
##        beans       potato        Month         Year    ttcity_u5     ttport_1 
##   3025.10953   2056.80364    275.24757    507.18487    260.09925    234.27023 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##    120.24193    228.55221    590.50143    172.02651    196.30755    305.19412 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##    715.78164    116.03203    290.06653    150.57802    140.41312    823.67181 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##    140.32204     99.90244    259.77001    126.00875    136.64634    274.63460 
##       bio_19      MAI_ARE      MAI_YLD    Longitude     Latitude rain.sum.lag 
##    192.57925    259.25070    421.06640    357.49980    337.54338    172.75824
======= ## [6] "Month" "potato" "bio_12" "maize" "rain.sum.lag" ## [11] "Longitude" "sorghum" "bmillet" "bio_3" "bio_7" ## [16] "MAI_YLD" "ttcity_u5" "bio_18" "Latitude" "MAI_ARE"
rf1$importanceSD
##        maize         rice      sorghum      bmillet      fmillet        wheat 
##    1903.5260    2751.4873     480.8507     616.1691    1642.3436    1571.3900 
##        beans       potato        Month         Year    ttcity_u5     ttport_1 
##    2966.5417    1932.1201     275.9945     532.0094     300.4819     225.4285 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##     112.4326     245.6297     580.2968     147.2850     194.4903     284.8365 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##     689.6151     111.4434     306.9555     178.1498     154.7698     815.7574 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##     128.3945     126.6461     269.3043     125.3841     193.9910     268.8303 
##       bio_19      MAI_ARE      MAI_YLD    Longitude     Latitude rain.sum.lag 
##     186.4857     250.0579     454.9184     340.4519     299.3250     169.0741
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Fit The Random Forest Model (2)

Estimate more parsimonious specification

In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.

# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                     Month +
                     Year + 
                     ttport_1 +
                     bio_3 + bio_6  + bio_9 + bio_12 + 
                     rain.sum.lag, 
                   data=training_data, na.rm=TRUE)

rf
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttport_1 +      bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
<<<<<<< HEAD
##           Mean of squared residuals: 30081.9
##                     % Var explained: 94.23
# evaluate
varImpPlot(rf)

(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 172.4226
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")

======= ## Mean of squared residuals: 29993.92 ## % Var explained: 94.25
# evaluate
varImpPlot(rf)

(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 173.1372
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

spatial prediction

These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.

year <- 2023

Note: we must set the rain.sum.lag variables for each month we are predicting

Predicted Maize Prices

#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_maize <- data.frame(
    maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_maize <- lapply(1:12, predict_for_month)


# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100 

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100

# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0))  # Set margins to 0 for inner plots
for (i in 1:12) {
  plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
       zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)

# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9,
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
           legend.args = list(text = "Predicted Maize Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Beans Prices

# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"

  newstack <- c(rstack, rain_sum_lag)

  const_beans <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
    Month = month,
    Year = year
  )

  pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_beans <- lapply(1:12, predict_for_beans)

# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year), 
       zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Beans Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Rice Prices

# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_rice <- data.frame(
    maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_rice <- lapply(1:12, predict_for_rice)

# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year), 
       zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1)) 

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Rice Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Sorghum Prices

# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_sorghum <- data.frame(
    maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = 2023
  )
  
  pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_sorghum <- lapply(1:12, predict_for_sorghum)

# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 300 

# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year), 
       zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Sorghum Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Bulrush Millet Prices

# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_bmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_bmillet <- lapply(1:12, predict_for_bmillet)


# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot bmillet prices
break_interval <- 150 
par(mar = c(0, 0, 0, 0)) 
for (i in 1:12) {
  plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year), 
       zlim = c(min_bmillet, max_bmillet), col = color_palette, 
       breaks = seq(min_bmillet, max_bmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5) 

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Bulrush Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Finger Millet Prices

#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_fmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_fmillet <- lapply(1:12, predict_for_fmillet)

# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot fmillet prices
break_interval <- 300 
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year), 
       zlim = c(min_fmillet, max_fmillet), col = color_palette, 
       breaks = seq(min_fmillet, max_fmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)  

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Finger Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted Wheat Prices

#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_wheat <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_wheat <- lapply(1:12, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100

# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year), 
       zlim = c(min_wheat, max_wheat), col = color_palette, 
       breaks = seq(min_wheat, max_wheat, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Wheat Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Predicted potato prices

#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_potato <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_potato <- lapply(1:12, predict_for_potato)

# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year), 
       zlim = c(min_potato, max_potato), col = color_palette, 
       breaks = seq(min_potato, max_potato, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5)  # Adjust n as needed

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Potato Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Prediction Evaluation

1. Using Validation data

pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
<<<<<<< HEAD
## [1] "Mean Squared Error: 142415.415093055"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error:  176.0013712396"
#Save predicted & observed price
=======
## [1] "Mean Squared Error: 136826.009372833"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error:  176.664380207372"
#Save predicted & observed yield
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X"         "actual"    "predicted"
#RMSE predicting from rf - predicited vs observed 
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
<<<<<<< HEAD
## [1] 377.38
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,2)
print(rf.r2)
## [1] 0.79
range(actual)
## [1]  350 4050
range(pred)
## [1]  619.2183 3241.6656
=======
## [1] 369.9
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,3)
print(rf.r2)
## [1] 0.8
range(actual)
## [1]  350 4050
range(pred)
## [1]  612.277 3245.126
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh)",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  scale_y_continuous("Predicted Price (Tsh)",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
  annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

2. Compare the observed Prices (the training data) with the predicted Prices (predicted using the training data) using stats package

library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
  geom_point(colour = "blue", size = 1.4 ,alpha=0.6) + 
  ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
  geom_abline(slope = 1, alpha=0.3) +
  annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3)  +
  annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3)  +
  labs(x = "Observed Price (Tsh)", y = "Predicted Price (Tsh)") +
  xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e

Partial dependence plots

Plot Partial dependence of all the variables used except food commodities and months.

library(caret)

var_importance <- varImp(rf)

impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))

for (i in seq_along(predictors_to_plot)) {
  partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
              main="Partial Dependence")
}
<<<<<<< HEAD

Summarize the price data by commodity

# Determine the number of observations for each commodity
#After removing NAs
crop_counts <- table(mypts$Crop)
crop_counts
## 
##    Maize     Rice  Sorghum B.Millet F.Millet    Wheat    Beans   Potato 
##      859      858      705      559      731      535      854      832
# Create a data frame with crop and count
crop_summary <- data.frame(
  Crop = names(crop_counts),
  Count = as.vector(crop_counts)
)
# Print the crop summary
print(crop_summary)
##       Crop Count
## 1    Maize   859
## 2     Rice   858
## 3  Sorghum   705
## 4 B.Millet   559
## 5 F.Millet   731
## 6    Wheat   535
## 7    Beans   854
## 8   Potato   832

correlation Plots

These are correlation plots for pooled sample in two periods: post-harvest (May-Oct) and lean season (Nov-April).

#We'll use mean Monthly data in wide format
prices.monthly
##             Region     Market Month Year Latitude Longitude mai.price ric.price
##   1:        Arusha     Arusha     5 2021 -3.36968  36.68808  469.0000  1530.000
##   2: Dar es Salaam     Temeke     5 2021 -6.85097  39.25672  485.0000  1650.000
##   3:        Dodoma    Majengo     5 2021 -6.17971  35.74109  501.5000  1592.000
##   4:        Dodoma   Kibaigwa     5 2021 -6.08110  36.64645  375.0000       NaN
##   5:        Kagera     Bukoba     5 2021 -1.33140  31.81293  426.2500  1260.000
##  ---                                                                           
## 860:        Mwanza     Mwanza     7 2024 -2.51969  32.90144  725.0000  2350.000
## 861:         Pwani      Pwani     7 2024 -6.44221  38.90622  835.7143  1946.429
## 862:        Simiyu    Bariadi     7 2024 -2.80355  33.98699  660.0000  1450.000
## 863:        Kigoma     Kigoma     7 2024 -4.89697  29.65987  578.5714  1417.857
## 864:         Rukwa Sumbawanga     7 2024 -7.95239  31.62319  546.7857  1653.571
##      sor.price bul.price fin.price whe.price bea.price pot.price
##   1:   695.000   761.000  1333.000  793.0000  1545.000  745.0000
##   2:   900.000   900.000  1775.000 1230.0000  2420.000  730.0000
##   3:   558.000   581.000  1752.000       NaN  2075.000  618.0000
##   4:   437.500       NaN       NaN       NaN       NaN       NaN
##   5:  1375.000  1337.500  1637.500 1637.5000  1300.000  675.0000
##  ---                                                            
## 860:  1428.571  1421.429  1621.429       NaN  2528.571 1142.8571
## 861:  1442.857  1550.000  1857.143 1975.0000  2967.857 1735.7143
## 862:  1400.000  1739.286  1728.571 2457.1429  2750.000 1500.0000
## 863:  1821.429  1892.857  1907.143 1867.8571  2228.571 1075.0000
## 864:       NaN       NaN   975.000  719.6429  2135.714  641.0714
# Create a function to determine the season
get_season <- function(month) {
  if (month %in% 5:10) {
    return("Post-Harvest")
  } else {
    return("Lean Season")
  }
}
# Add a 'Season' column to the data
prices.monthly$Season <- sapply(prices.monthly$Month, get_season)
# Post Harvest Data
post_harvest_data <- prices.monthly[prices.monthly$Season == "Post-Harvest", ]
# Remove rows with NaNs from the Post-Harvest data
post_harvest_data <- post_harvest_data[complete.cases(post_harvest_data[, 7:14]), ]

# Lean Season data
lean_season_data <- prices.monthly[prices.monthly$Season == "Lean Season", ]
# Remove rows with NaNs from the Lean Season data
lean_season_data <- lean_season_data[complete.cases(lean_season_data[, 7:14]), ]
# Calculate correlation matrix for Post-Harvest season
post_harvest_corr <- cor(post_harvest_data[, 3:14])
# Calculate correlation matrix for Lean Season
lean_season_corr <- cor(lean_season_data[, 3:14])
# Plot correlation matrix for Post-Harvest season
corrplot(post_harvest_corr, 
         method = "color",          
         title = "",                
         tl.col = "black",          
         tl.cex = 0.5,             
         addCoef.col = "black",     
         number.cex = 0.5,         
         number.digits = 2) 

# Add title to the plot
title(main = "Post-Harvest Correlation Matrix", 
      line = 4,                
      cex.main = 0.9)

# Plot correlation matrix for Lean Season
corrplot(lean_season_corr, 
         method = "color", 
         title = "",
         tl.col = "black",       
         tl.cex = 0.5, 
         addCoef.col = "black",
         number.cex = 0.5,
         number.digits = 2)

# Add title to the plot
title(main = "Lean Season Correlation Matrix", 
      line = 4,                
      cex.main = 0.9) 

Crop Specific Price Predictions

We are doing Crop Specific Price Predictions to determine which model performs better at prediction by comparing predictions r2 or rmse ## Maize Price Prediction

# Filter data for maize
mypts_df2 <- as.data.frame(mypts)
mypts_maize <- mypts_df2[mypts_df2$Crop == "Maize", ]
unique(mypts_maize$Crop)
## [1] Maize
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_maize <- mypts_maize %>%
  mutate(Crop = as.character(Crop)) %>%
  mutate(Crop = factor(Crop))
# check again
unique(mypts_maize$Crop)
## [1] Maize
## Levels: Maize
mypts_maize <- vect(mypts_maize, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for maize
training_data_maize <- mypts_maize[mypts_maize$Year %in% c(2021, 2022, 2023), ]
# Filter the data for validation (Jan 2024 - July 2024)
validation_data_maize <- mypts_maize[mypts_maize$Year == 2024, ]
# Fit the Random Forest Model
rf_maize <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, 
                         data = training_data_maize, 
                         na.rm = TRUE)

rf_maize
## 
## Call:
##  randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 +      bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_maize,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 12980.05
##                     % Var explained: 84.36
# evaluate
varImpPlot(rf_maize)

Prediction Evaluation Maize Only RF Model

We predict the validation data using the rf model then calculate R^2 and RMSE

pred_maize<-predict(object=rf_maize, newdata=validation_data_maize)
actual_maize<-validation_data_maize$pkg
result_maize<-data.frame(actual=actual_maize, predicted=pred_maize)
#Save predicted & observed yield
write.csv(result_maize, "result_maize.csv")

#reading result_maize.csv file (predicted vs observed)
rslt_m <- read.csv("result_maize.csv", header=T)
print(names(rslt_m))
## [1] "X"         "actual"    "predicted"
#R-square predicting from rf_maize predicited vs observed 
rf_maize.rmse<-round(sqrt(mean( (rslt_m$actual-rslt_m$predicted)^2 , na.rm = TRUE )),2)
print(rf_maize.rmse)
## [1] 325.47
#R-square
rf_maize.r2<-round(summary(lm(actual~predicted, rslt_m))$r.squared,2)
print(rf_maize.r2)
## [1] 0.38
#R-square
rf_maize.r2<-round(summary(lm(actual~predicted, rslt_m))$r.squared,2)
print(rf_maize.r2)
## [1] 0.38
range(actual_maize)
## [1]  350 1250
range(pred_maize, na.rm = TRUE)
## [1]  703.9585 1323.0264
#plotting predicted Vs observed
ggplot(result_maize, aes(x=actual_maize, y=pred_maize), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Maize Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh)",
                     limits = c(0, 1500),
                     breaks = seq(0, 1500, 500)) +
  scale_y_continuous("Predicted Price (Tsh)",
                     limits = c(0, 1500),
                     breaks = seq(0, 1500, 500)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 400, y = 1500, label = paste("RMSE:", rf_maize.rmse)) +
  annotate("text", x = 400, y = 1400, label = paste("R^2: ", rf_maize.r2), parse = TRUE)

Beans Price Prediction

# Filter data for Beans
mypts_beans <- mypts_df2[mypts_df2$Crop == "Beans", ]
unique(mypts_beans$Crop)
## [1] Beans
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_beans <- mypts_beans %>%
  mutate(Crop = as.character(Crop)) %>%
  mutate(Crop = factor(Crop))
# check again
unique(mypts_beans$Crop)
## [1] Beans
## Levels: Beans
mypts_beans <- vect(mypts_beans, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for beans
training_data_beans <- mypts_beans[mypts_beans$Year %in% c(2021, 2022, 2023), ]

# Validation Data beans
validation_data_beans <- mypts_beans[mypts_beans$Year == 2024, ]
# Fit the Random Forest Model
rf_beans <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, 
                         data = training_data_beans, 
                         na.rm = TRUE)

rf_beans
## 
## Call:
##  randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 +      bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_beans,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 48290.36
##                     % Var explained: 85.92
# evaluate
varImpPlot(rf_beans)

Prediction Evaluation Beans Only RF Model Using Validation data

pred_beans<-predict(object=rf_beans, newdata=validation_data_beans)
actual_beans<-validation_data_beans$pkg
result_beans<-data.frame(actual=actual_beans, predicted=pred_beans)
#Save predicted & observed beans price
write.csv(result_beans, "result_beans.csv")
#reading result_beans.csv file (predicted vs observed)
rslt_b <- read.csv("result_beans.csv", header=T)
print(names(rslt_b))
## [1] "X"         "actual"    "predicted"
#R-square predicting from rf_beans predicited vs observed 
rf_beans.rmse<-round(sqrt(mean( (rslt_b$actual-rslt_b$predicted)^2 , na.rm = TRUE )),2)
print(rf_beans.rmse)
## [1] 348.32
#R-square
rf_beans.r2<-round(summary(lm(actual~predicted, rslt_b))$r.squared,2)
print(rf_beans.r2)
## [1] 0.38
range(actual_beans)
## [1] 1252.5 3850.0
range(pred_beans)
## [1] 1868.940 3287.237
#plotting predicted Vs observed Beans Prices
ggplot(result_beans, aes(x=actual_beans, y=pred_beans), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Beans Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh)",
                     limits = c(0, 4000),
                     breaks = seq(0, 4000, 1000)) +
  scale_y_continuous("Predicted Price (Tsh)",
                     limits = c(0, 4000),
                     breaks = seq(0, 4000, 1000)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 1000, y = 3800, label = paste("RMSE:", rf_beans.rmse)) +
  annotate("text", x = 1000, y = 3600, label = paste("R^2: ", rf_beans.r2), parse = TRUE)

Rice Price Prediction

# Filter data for Rice
mypts_rice <- mypts_df2[mypts_df2$Crop == "Rice", ]
unique(mypts_rice$Crop)
## [1] Rice
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_rice <- mypts_rice %>%
  mutate(Crop = as.character(Crop)) %>%
  mutate(Crop = factor(Crop))
# check again
unique(mypts_rice$Crop)
## [1] Rice
## Levels: Rice
mypts_rice <- vect(mypts_rice, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for rice
training_data_rice <- mypts_rice[mypts_rice$Year %in% c(2021, 2022, 2023), ]

# Validation Data rice
validation_data_rice <- mypts_rice[mypts_rice$Year == 2024, ]
# Fit the Random Forest Model
rf_rice <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, 
                         data = training_data_rice, 
                         na.rm = TRUE)

rf_rice
## 
## Call:
##  randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 +      bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_rice,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 38862.74
##                     % Var explained: 89.9
# evaluate
varImpPlot(rf_rice)

Prediction Evaluation Rice Only RF Model Using Validation data

pred_rice<-predict(object=rf_rice, newdata=validation_data_rice)
actual_rice<-validation_data_rice$pkg
result_rice<-data.frame(actual=actual_rice, predicted=pred_rice)
#Save predicted & observed rice price
write.csv(result_rice, "result_rice.csv")
#reading result_rice.csv file (predicted vs observed)
rslt_r <- read.csv("result_rice.csv", header=T)
print(names(rslt_r))
## [1] "X"         "actual"    "predicted"
#R-square predicting from rf_rice predicited vs observed 
rf_rice.rmse<-round(sqrt(mean( (rslt_r$actual-rslt_r$predicted)^2 , na.rm = TRUE )),2)
print(rf_rice.rmse)
## [1] 480.29
#R-square
rf_rice.r2<-round(summary(lm(actual~predicted, rslt_r))$r.squared,2)
print(rf_rice.r2)
## [1] 0.29
range(actual_rice)
## [1] 1316.667 3010.000
range(pred_rice)
## [1] 1898.561 3101.155
#plotting predicted Vs observed Rice Prices
ggplot(result_rice, aes(x=actual_rice, y=pred_rice), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Rice Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh)",
                     limits = c(0, 4000),
                     breaks = seq(0, 4000, 1000)) +
  scale_y_continuous("Predicted Price (Tsh)",
                     limits = c(0, 4000),
                     breaks = seq(0, 4000, 1000)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 1000, y = 3800, label = paste("RMSE:", rf_rice.rmse)) +
  annotate("text", x = 1000, y = 3600, label = paste("R^2: ", rf_rice.r2), parse = TRUE)

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
<<<<<<< HEAD ======= ======= Predictive mapping of wholesale grain prices for rural areas in Tanzania

Introduction

We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, using monthly data from across Tanzania over the period May 2021-June 2024.

This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.

Load Libraries

library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3

Data

This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.

setwd("H:/Tanzania Price data/Datasets")

prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
## [1] 7441   21
head(prices)
##           Region   Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1:        Arusha   Arusha             43000             45000           140000
## 2: Dar es Salaam   Temeke             46000             47000           120000
## 3:        Dodoma  Majengo             45000             50000           132000
## 4:        Dodoma Kibaigwa             30000             42000               NA
## 5:        Kagera   Bukoba             33000             35000           110000
## 6:       Manyara   Babati             30000             45000           120000
##    Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1:           170000               60000               65000
## 2:           210000               80000              100000
## 3:           200000               50000               60000
## 4:               NA               45000               48000
## 5:           140000              140000              150000
## 6:           170000               60000               80000
##    Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1:                      70000                      75000
## 2:                      80000                     100000
## 3:                      52000                      64000
## 4:                         NA                         NA
## 5:                     120000                     140000
## 6:                      80000                     100000
##    Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1:                    120000                    125000             70000
## 2:                    175000                    180000            110000
## 3:                    200000                    252000                NA
## 4:                        NA                        NA                NA
## 5:                    170000                    180000            170000
## 6:                    120000                    130000            100000
##    Wheat..max.price. Beans..min.price. Beans..max.price.
## 1:             78000            130000            165000
## 2:            120000            220000            260000
## 3:                NA            200000            245000
## 4:                NA                NA                NA
## 5:            180000            110000            170000
## 6:            120000            150000            180000
##    Irish.Potatoes..min.price. Irish.Potatoes..max.price.     Date Latitude
## 1:                      70000                      75000 5/5/2021 -3.36968
## 2:                      75000                      75000 5/5/2021 -6.85097
## 3:                      56000                      68000 5/5/2021 -6.17971
## 4:                         NA                         NA 5/5/2021 -6.08110
## 5:                      60000                      75000 5/5/2021 -1.33140
## 6:                      60000                      60000 5/5/2021 -4.21006
##    Longitude
## 1:  36.68808
## 2:  39.25672
## 3:  35.74109
## 4:  36.64645
## 5:  31.81293
## 6:  35.74915
table(prices$Market)
## 
##       Arusha       Babati      Bariadi     Buguruni       Bukoba        Geita 
##          371          227           64           10          373           38 
##      Igawilo        Ilala       Iringa     Kibaigwa       Kigoma  Kilimanjaro 
##           24          187          375          208          227           13 
##    Kilombero    Kinondoni        Lindi      Majengo      Manyara        Mbeya 
##           24          202          232          408          115          117 
##     Mgandini      Mnalani     Morogoro        Moshi       Mpanda      Mpimbwe 
##           24            9          369          225          193           39 
##       Mtwara       Musoma Mwananyamala       Mwanza       Namfua       Njombe 
##          384          270           24          333           24          172 
##    Nyankumbu        Pwani    Shinyanga      Singida       Songea       Songwe 
##           24           55          266           83          344           34 
##   Sumbawanga       Tabora      Tandale      Tandika        Tanga       Temeke 
##          368          347           41           50          359          153 
##       Ubungo        Ujiji 
##           32            4
sapply(prices, class)
##                     Region                     Market 
##                "character"                "character" 
##          Maize..min.price.          Maize..max.price. 
##                  "integer"                  "integer" 
##           Rice..min.price.           Rice..max.price. 
##                  "integer"                  "integer" 
##        Sorghum..min.price.        Sorghum..max.price. 
##                  "integer"                  "integer" 
## Bulrush.Millet..min.price. Bulrush.Millet..max.price. 
##                  "integer"                  "integer" 
##  Finger.Millet..min.price.  Finger.Millet..max.price. 
##                  "integer"                  "integer" 
##          Wheat..min.price.          Wheat..max.price. 
##                  "integer"                  "integer" 
##          Beans..min.price.          Beans..max.price. 
##                  "integer"                  "integer" 
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. 
##                  "integer"                  "integer" 
##                       Date                   Latitude 
##                "character"                  "numeric" 
##                  Longitude 
##                  "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)

Basic Data preperation

Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded

unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1:    Arusha -3.36968  36.68808
## 2: Kilombero -3.37324  36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
##          Market  Latitude Longitude
## 1:       Temeke -6.850970  39.25672
## 2:    Kinondoni -6.784070  39.27007
## 3:        Ilala -6.829410  39.26289
## 4:      Tandika -6.867912  39.25480
## 5:     Buguruni -6.838380  39.24359
## 6:      Tandale -6.795230  39.24085
## 7:       Ubungo -6.793620  39.20966
## 8: Mwananyamala -6.788890  39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1:  Majengo -6.17971  35.74109
## 2: Kibaigwa -6.08110  36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Bukoba  -1.3314  31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Babati -4.21006  35.74915
## 2: Manyara -4.46011  37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
##        Market Latitude Longitude
## 1: Sumbawanga -7.95239  31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Mtwara -10.28009  40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Tabora -5.01972  32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
##      Market  Latitude Longitude
## 1:    Tanga -5.074260  39.09993
## 2: Mgandini -5.086235  39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Iringa -7.78001  35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Kigoma -4.89697  29.65987
## 2:  Ujiji -4.90837  29.69203
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1: Morogoro -6.82771  37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Mwanza -2.51969  32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Musoma -1.49982   33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Songea -10.67873  35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1: Shinyanga -3.67226  33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
##         Market Latitude Longitude
## 1:       Moshi -3.34865  37.34352
## 2: Kilimanjaro -3.33854  37.32654
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1:   Mbeya -8.909940  33.45517
## 2: Igawilo -8.924246  33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Mpanda -6.34295  31.07299
## 2: Mpimbwe -7.24425  31.81782
## 3: Majengo -6.34809  31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Njombe -9.33805  34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1:  Lindi   -9.995    39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1: Singida -4.812610   34.7428
## 2:  Namfua -4.821121   34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:   Pwani -6.44221  38.90622
## 2: Mnalani -6.44221  38.90622
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1: Bariadi -2.80355  33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
##       Market  Latitude Longitude
## 1:     Geita -2.870760  32.23408
## 2: Nyankumbu -2.895955  32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Songwe -8.95179  33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "integer"     "integer"     "integer" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "integer"     "integer"     "integer"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
#convert prices to numeric 
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)

prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)

sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "numeric"     "numeric"     "numeric" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "numeric"     "numeric"     "numeric"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100

prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day   <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year  <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
                     "ric.price.min", "ric.price.max",
                     "sor.price.min", "sor.price.max",
                     "bul.price.min", "bul.price.max",
                     "fin.price.min", "fin.price.max",
                     "whe.price.min", "whe.price.max", 
                     "bea.price.min", "bea.price.max", 
                     "pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market 
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE), 
           ric.price = mean(ric.price, na.rm = TRUE), 
           sor.price = mean(sor.price, na.rm = TRUE), 
           bul.price = mean(bul.price, na.rm = TRUE),
           fin.price = mean(fin.price, na.rm = TRUE), 
           whe.price = mean(whe.price, na.rm = TRUE), 
           bea.price = mean(bea.price, na.rm = TRUE), 
           pot.price = mean(pot.price, na.rm = TRUE)), 
       by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly 
##             Region     Market Month Year Latitude Longitude mai.price ric.price
##   1:        Arusha     Arusha     5 2021 -3.36968  36.68808  469.0000  1530.000
##   2: Dar es Salaam     Temeke     5 2021 -6.85097  39.25672  485.0000  1650.000
##   3:        Dodoma    Majengo     5 2021 -6.17971  35.74109  501.5000  1592.000
##   4:        Dodoma   Kibaigwa     5 2021 -6.08110  36.64645  375.0000       NaN
##   5:        Kagera     Bukoba     5 2021 -1.33140  31.81293  426.2500  1260.000
##  ---                                                                           
## 836:        Mwanza     Mwanza     6 2024 -2.51969  32.90144  725.0000  2350.000
## 837:         Pwani    Mnalani     6 2024 -6.44221  38.90622  772.2222  2000.000
## 838:        Simiyu    Bariadi     6 2024 -2.80355  33.98699  563.3333  1316.667
## 839:        Kigoma     Kigoma     6 2024 -4.89697  29.65987  504.2222  1488.889
## 840:         Rukwa Sumbawanga     6 2024 -7.95239  31.62319  533.3333  1750.000
##      sor.price bul.price fin.price whe.price bea.price pot.price
##   1:   695.000   761.000  1333.000  793.0000  1545.000  745.0000
##   2:   900.000   900.000  1775.000 1230.0000  2420.000  730.0000
##   3:   558.000   581.000  1752.000       NaN  2075.000  618.0000
##   4:   437.500       NaN       NaN       NaN       NaN       NaN
##   5:  1375.000  1337.500  1637.500 1637.5000  1300.000  675.0000
##  ---                                                            
## 836:  1400.000  1350.000  1550.000       NaN  2400.000 1000.0000
## 837:  1522.222  1544.444  2022.222 2400.0000  3116.667 1800.0000
## 838:  1300.000  1788.889  1716.667 2316.6667  2805.556 1500.0000
## 839:  1266.667  1266.667  1433.333 2122.2222  2216.667 1127.7778
## 840:       NaN       NaN  1088.889  780.5556  1888.889  652.7778
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize    Rice     Sorghum  B.Millet F.Millet Wheat    Beans    Potato  
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize   := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice    := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat   := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans   := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato  := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1  , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2  , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3  , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4  , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5  , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6  , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7  , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8  , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9  , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
##  [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
##  [2] "travel/travel_time_to_cities_u5.tif"        
##  [3] "TRUE/gadm/gadm41_TZA_0_pk.rds"              
##  [4] "TRUE/gadm/gadm41_TZA_1_pk.rds"              
##  [5] "TRUE/gadm/gadm41_TZA_2_pk.rds"              
##  [6] "TRUE/gadm/gadm41_TZA_3_pk.rds"              
##  [7] "TRUE/spam/spam2017v2r1_ssa_area.zip"        
##  [8] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"    
##  [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"    
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"    
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"    
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"    
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"    
## [14] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"    
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"    
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"    
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"    
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"    
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"    
## [20] "TRUE/spam/spam2017v2r1_ssa_yield.zip"       
## [21] "TRUE/travel/travel_time_to_cities_u5.tif"   
## [22] "TRUE/travel/travel_time_to_ports_1.tif"     
## [23] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)

tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)

# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)

# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)

Interpolate

## Interpolate

#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)

Predict prices with coodinates only

rf <- randomForest(pkg ~ Longitude + Latitude , data=mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)

Covariates

The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.

ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm    <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area   <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield  <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd  <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield)  <- c("MAI_YLD") # SPAM maize yield 2010
names(popd)  <- c("popdens") # GPW4

comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"

comment(popd) <- "population density 2020 (GPW4 @ 10dm)"

comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"

Harmonize rasters to national boundaries and common resolution

ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm    <- resample(clm, r)
area   <- resample(area, r)
popd   <- resample(popd, r)
freq(is.na(area))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
area <- classify(area, cbind(NA,0)) 
yield  <- resample(yield, r)
freq(is.na(yield))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
yield <- classify(yield, cbind(NA,0)) 
# check again 
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE

Generate Latitude and Longitude grid

latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")

Prepare Predictor Stack

rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
##  [1] "ttcity_u5" "ttport_1"  "bio_1"     "bio_2"     "bio_3"     "bio_4"    
##  [7] "bio_5"     "bio_6"     "bio_7"     "bio_8"     "bio_9"     "bio_10"   
## [13] "bio_11"    "bio_12"    "bio_13"    "bio_14"    "bio_15"    "bio_16"   
## [19] "bio_17"    "bio_18"    "bio_19"    "MAI_ARE"   "MAI_YLD"   "popdens"  
## [25] "latitude"  "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius  - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 

Lag Rainfall

Bring in Rainfall Data from Chirps

chirps_path <- "H:/Tanzania Price data/chirps_data"

chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)

# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)

#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)

writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)

Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class       : SpatRaster 
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : Tz_chirps_monthly_croped.tif 
## names       : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ... 
## min values  :  -9999.0000,  -9999.0000,   -9999.000,  -9999.0000,  -9999.0000,  -9999.0000, ... 
## max values  :    459.7112,    504.3329,     508.448,    585.3241,    750.1949,    960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))

#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
##  [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
##  [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
##  [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06"
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
##  [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
##  [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01"
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates

#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates

# Check the SpatRaster object
print(Tz_chirps_monthly)
## class       : SpatRaster 
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01,  2021-01-01,  2021-02-01,  2021-03-01, 2021-04-01, ... 
## min values  :   5.859746,   1.903993,   0.8161677,   0.3187093,   0.9539621,    1.17571, ... 
## max values  : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463,  960.33765, ... 
## time (days) : 2020-11-01 to 2024-06-01
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
                                  expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class       : SpatRaster 
## dimensions  : 215, 222, 44  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.79649,   27.27837,   8.092316,   4.489223,   11.30184,   15.74836, ... 
## max values  :  313.14248,  326.88227, 416.590468, 395.422070,  464.67840,  569.81990, ... 
## time (days) : 2020-11-01 to 2024-06-01
Rainfall <- Rainfall_focal_sum_100km
#Resample 
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class       : SpatRaster 
## dimensions  : 144, 144, 44  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.89972,    32.9472,   8.116516,   4.527868,   11.83634,   15.90436, ... 
## max values  :  310.41034,   326.4339, 416.389160, 395.367279,  462.19266,  550.26270, ... 
## time (days) : 2020-11-01 to 2024-06-01

Define a function to calculate the 6-month lagged sum of rainfall values.

Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.

calculate_lagged_sum <- function(raster_stack, num_months = 6) {
  # Get the time vector from the raster stack
  time_vector <- time(raster_stack)
  
  # Initialize list to store lagged sum rasters
  lagged_sum_rasters <- vector("list", length(time_vector))
  
  # Loop through each layer in the raster stack
  for (i in seq_along(time_vector)) {
    if (i > num_months) {  # We need at least 'num_months' previous layers to calculate the lagged sum
      # Determine the start and end dates for the lag period
      end_date <- time_vector[i] # Date of the current layer being processed
      start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
      
      # Select the layers that fall within the lag period
      lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
      
      # Calculate the sum of the selected layers
      if (nlyr(lag_period_layers) == num_months) {
        lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
      } else {
        lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                        crs = crs(raster_stack), ext = ext(raster_stack), 
                                        vals = NA)  # Use an empty raster with NA values
      }
    } else {
      lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                      crs = crs(raster_stack), ext = ext(raster_stack), 
                                      vals = NA)  # Use an empty raster with NA values
    }
  }
  
  # Combine the lagged sum rasters into a single raster stack, excluding empty rasters
  lagged_sum_stack <- rast(lagged_sum_rasters)
  
  # Set the names for the layers in the lagged sum stack
  names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
  
  return(lagged_sum_stack)
}

# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)

# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class       : SpatRaster 
## dimensions  : 144, 144, 38  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ... 
## min values  :   172.7856,   105.5447,   105.4121,   105.1864,   26.20571,   7.070483, ... 
## max values  :  1512.4689,  1355.9669,  1052.9220,  1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")

plot(lagged_rainfall_sum_filtered)

## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class       : SpatRaster 
## dimensions  : 144, 144, 26  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       :    ttcity_u5,  ttport_1,     bio_1,     bio_2,    bio_3,     bio_4, ... 
## min values  : 3.350081e-02,  997.9427,  4.333545,  6.546645, 53.85881,  19.75255, ... 
## max values  : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
names(rstack)
##  [1] "ttcity_u5" "ttport_1"  "bio_1"     "bio_2"     "bio_3"     "bio_4"    
##  [7] "bio_5"     "bio_6"     "bio_7"     "bio_8"     "bio_9"     "bio_10"   
## [13] "bio_11"    "bio_12"    "bio_13"    "bio_14"    "bio_15"    "bio_16"   
## [19] "bio_17"    "bio_18"    "bio_19"    "MAI_ARE"   "MAI_YLD"   "popdens"  
## [25] "latitude"  "longitude"
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
##  [1] "ttcity_u5"               "ttport_1"               
##  [3] "bio_1"                   "bio_2"                  
##  [5] "bio_3"                   "bio_4"                  
##  [7] "bio_5"                   "bio_6"                  
##  [9] "bio_7"                   "bio_8"                  
## [11] "bio_9"                   "bio_10"                 
## [13] "bio_11"                  "bio_12"                 
## [15] "bio_13"                  "bio_14"                 
## [17] "bio_15"                  "bio_16"                 
## [19] "bio_17"                  "bio_18"                 
## [21] "bio_19"                  "MAI_ARE"                
## [23] "MAI_YLD"                 "popdens"                
## [25] "latitude"                "longitude"              
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
head(mypts)
##          Region   Market Month Year Latitude Longitude  Crop    pkg maize rice
## 1        Arusha   Arusha     5 2021 -3.36968  36.68808 Maize 469.00     1    0
## 2 Dar es Salaam   Temeke     5 2021 -6.85097  39.25672 Maize 485.00     1    0
## 3        Dodoma  Majengo     5 2021 -6.17971  35.74109 Maize 501.50     1    0
## 4        Dodoma Kibaigwa     5 2021 -6.08110  36.64645 Maize 375.00     1    0
## 5        Kagera   Bukoba     5 2021 -1.33140  31.81293 Maize 426.25     1    0
## 6       Manyara   Babati     5 2021 -4.21006  35.74915 Maize 375.00     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 2       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 3       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 4       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 5       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 6       0       0       0     0     0      0   0   0   0   0   1   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3     bio_4
## 1   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.60644
## 3   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4   0   0   0   0  83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5   0   0   0   0  17.1729083 2006.820 20.84789  8.869465 83.92802  38.70414
## 6   0   0   0   0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717  649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068  768.0339
##     bio_13      bio_14    bio_15   bio_16       bio_17   bio_18       bio_19
## 1 237.8160  7.36089814  92.10601 482.4729  24.95659498 276.6105  32.88688141
## 2 264.3072 28.32696315  76.22544 594.2531  90.46537640 268.0798 105.37909361
## 3 133.9309  0.01660535 115.12207 375.4677   0.08605806 284.0681   0.08605806
## 4 129.1152  0.21909479 100.22821 368.5410   2.45863517 320.1588   4.15077049
## 5 346.9308 43.30159629  58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006  0.91639794  89.44063 381.6907   6.74877687 325.6114  12.35086025
##        MAI_ARE      MAI_YLD   popdens latitude longitude
## 1 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672
## 3 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109
## 4 1.489781e+03 3.828877e+02  101.0565 -6.08110  36.64645
## 5 6.446327e+02 1.834932e+03  280.9803 -1.33140  31.81293
## 6 5.496486e+02 1.917829e+03  219.1393 -4.21006  35.74915
##   2021-05-01_rain.sum.lag 2021-06-01_rain.sum.lag 2021-07-01_rain.sum.lag
## 1                541.4019                482.1614                435.6677
## 2                809.3149                777.8958                722.6213
## 3                637.1213                535.1131                365.9182
## 4                656.0751                567.7319                416.4267
## 5               1149.9932                937.6197                853.1074
## 6                576.1356                490.7897                395.8152
##   2021-08-01_rain.sum.lag 2021-09-01_rain.sum.lag 2021-10-01_rain.sum.lag
## 1                386.3060               320.00676               161.93116
## 2                649.9489               604.59631               337.22359
## 3                183.7701                52.54273                15.42844
## 4                277.1030               153.91512                66.15258
## 5                811.6691               763.24715               615.07899
## 6                289.9778               154.11524                68.29101
##   2021-11-01_rain.sum.lag 2021-12-01_rain.sum.lag 2022-01-01_rain.sum.lag
## 1               101.83447               179.80052                232.6533
## 2               216.69636               288.29559                406.3529
## 3                19.66440                70.12616                416.5519
## 4                49.56118               106.72427                413.7123
## 5               488.90052               605.55387                808.6724
## 6                46.47620               108.69646                245.5580
##   2022-02-01_rain.sum.lag 2022-03-01_rain.sum.lag 2022-04-01_rain.sum.lag
## 1                330.6131                383.2848                540.3073
## 2                490.8561                534.5386                763.3206
## 3                817.0083                917.8080               1000.7087
## 4                665.6779                764.1859                927.5299
## 5                899.5895                957.6149               1138.6710
## 6                441.5905                510.5650                641.9617
##   2022-05-01_rain.sum.lag 2022-06-01_rain.sum.lag 2022-07-01_rain.sum.lag
## 1                530.1299                455.6274                403.9131
## 2                816.5829                729.6579                636.5197
## 3                997.3701                947.9434                601.5170
## 4                958.6012                903.3041                595.9210
## 5               1219.1735               1115.3495                905.1190
## 6                643.2443                581.5847                444.9141
##   2022-08-01_rain.sum.lag 2022-09-01_rain.sum.lag 2022-10-01_rain.sum.lag
## 1                305.5547                252.9373               100.53888
## 2                552.9257                495.7899               268.19916
## 3                201.0583                100.2519                14.64072
## 4                344.1166                244.4002                79.55153
## 5                853.5636                777.4608               541.86704
## 6                248.8724                179.9022                48.53265
##   2022-11-01_rain.sum.lag 2022-12-01_rain.sum.lag 2023-01-01_rain.sum.lag
## 1               118.72860                197.4710                209.2468
## 2               249.91483                366.5402                424.9013
## 3                24.33803                210.4590                364.0985
## 4                63.76563                237.8550                386.3680
## 5               469.63839                598.3139                701.7206
## 6                63.04448                191.9892                242.6080
##   2023-02-01_rain.sum.lag 2023-03-01_rain.sum.lag 2023-04-01_rain.sum.lag
## 1                215.1493                279.3411                509.3222
## 2                470.6298                548.0837                839.2660
## 3                459.6726                564.8242                612.5806
## 4                488.0359                587.2738                698.1311
## 5                667.8848                832.3617                977.3868
## 6                281.6344                387.7473                521.9884
##   2023-05-01_rain.sum.lag 2023-06-01_rain.sum.lag 2023-07-01_rain.sum.lag
## 1                506.2685                448.0443                434.9563
## 2                867.6821                794.6204                712.8424
## 3                603.2209                416.3525                262.7130
## 4                703.3618                531.7645                383.2904
## 5                939.2760                855.6117                744.2889
## 6                509.0339                383.3928                332.5788
##   2023-08-01_rain.sum.lag 2023-09-01_rain.sum.lag 2023-10-01_rain.sum.lag
## 1                430.3774               366.50878               151.77184
## 2                671.7647               605.81817               375.59469
## 3                167.1400                61.98845                14.85932
## 4                281.9979               182.81237                74.83833
## 5                701.5641               576.55398               478.95586
## 6                293.7072               187.82502                55.57329
##   2023-11-01_rain.sum.lag 2023-12-01_rain.sum.lag 2024-01-01_rain.sum.lag
## 1               265.66469                299.4620                435.0957
## 2               742.63087                865.3863               1088.0394
## 3                61.64418                263.3634                560.4381
## 4               140.44970                310.9307                640.9717
## 5               677.93283                840.0104                941.7438
## 6               137.00931                245.0625                448.6259
##   2024-02-01_rain.sum.lag 2024-03-01_rain.sum.lag 2024-04-01_rain.sum.lag
## 1                531.3442                668.3936                896.7951
## 2               1108.6897               1277.2697               1582.9947
## 3                754.5153                856.2409                923.1968
## 4                804.6320                935.2507               1026.7195
## 5               1023.2543               1002.7834               1183.9751
## 6                603.8745                714.4857                863.0247
##   2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag
## 1                821.3999                770.3854
## 2               1235.6993               1103.4509
## 3                875.1995                673.5048
## 4                951.3725                780.1154
## 5               1003.6453                792.8721
## 6                793.5998                682.0563

Here we extract sum of lag rainfall for each row under the column rain.sum.lag

mypts_df <- as.data.frame(mypts)

# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
  # Extract the rainfall value for the current date
  rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
  return(rain_sum)
}

# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
  # Extract relevant data for the current row
  month <- mypts_df$Month[i]
  year <- mypts_df$Year[i]
  current_date <- paste0(year, "-", sprintf("%02d", month), "-01")  # Format date correctly
  # Pass necessary data to the function
  rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
  # Update the rain.sum.lag column
  mypts_df$rain.sum.lag[i] <- rain_sum
}

# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
##  [1] "Region"       "Market"       "Month"        "Year"         "Latitude"    
##  [6] "Longitude"    "Crop"         "pkg"          "maize"        "rice"        
## [11] "sorghum"      "bmillet"      "fmillet"      "wheat"        "beans"       
## [16] "potato"       "jan"          "feb"          "mar"          "apr"         
## [21] "may"          "jun"          "jul"          "aug"          "sep"         
## [26] "oct"          "nov"          "dec"          "ttcity_u5"    "ttport_1"    
## [31] "bio_1"        "bio_2"        "bio_3"        "bio_4"        "bio_5"       
## [36] "bio_6"        "bio_7"        "bio_8"        "bio_9"        "bio_10"      
## [41] "bio_11"       "bio_12"       "bio_13"       "bio_14"       "bio_15"      
## [46] "bio_16"       "bio_17"       "bio_18"       "bio_19"       "MAI_ARE"     
## [51] "MAI_YLD"      "popdens"      "latitude"     "longitude"    "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)

#We'll define month as an interger instead.
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize"    "Rice"     "Sorghum"  "B.Millet" "F.Millet" "Wheat"    "Beans"   
## [8] "Potato"

Linear model price Prediction

Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.

# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                 Month +
                 Year + 
                 ttcity_u5 + ttport_1 + 
                 bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                 bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14                  + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
                 MAI_ARE + MAI_YLD + 
                 Longitude + Latitude + 
                 rain.sum.lag,
               data = mypts)

summary(lm_model)
## 
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + 
##     wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + 
##     bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + 
##     bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + 
##     bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + 
##     Latitude + rain.sum.lag, data = mypts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1491.81  -239.93   -10.63   235.20  1602.93 
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.592e+05  1.221e+04 -37.593  < 2e-16 ***
## maize        -5.132e+01  1.850e+01  -2.774 0.005549 ** 
## rice          1.397e+03  1.850e+01  75.482  < 2e-16 ***
## sorghum       3.847e+02  1.957e+01  19.660  < 2e-16 ***
## bmillet       4.745e+02  2.103e+01  22.556  < 2e-16 ***
## fmillet       7.905e+02  1.933e+01  40.885  < 2e-16 ***
## wheat         9.527e+02  2.126e+01  44.821  < 2e-16 ***
## beans         1.516e+03  1.852e+01  81.852  < 2e-16 ***
## potato               NA         NA      NA       NA    
## Month         6.676e+00  1.886e+00   3.540 0.000403 ***
## Year          2.186e+02  5.854e+00  37.343  < 2e-16 ***
## ttcity_u5     1.881e+00  1.588e-01  11.850  < 2e-16 ***
## ttport_1     -1.918e+00  1.685e-01 -11.388  < 2e-16 ***
## bio_1        -2.372e+03  1.891e+02 -12.542  < 2e-16 ***
## bio_2        -2.675e+03  2.351e+02 -11.377  < 2e-16 ***
## bio_3         2.262e+02  1.925e+01  11.747  < 2e-16 ***
## bio_4         2.172e+01  4.038e+00   5.380 7.73e-08 ***
## bio_5         1.992e+03  1.793e+02  11.107  < 2e-16 ***
## bio_6        -1.950e+03  1.923e+02 -10.139  < 2e-16 ***
## bio_7                NA         NA      NA       NA    
## bio_8         9.181e+02  9.258e+01   9.917  < 2e-16 ***
## bio_9        -6.182e+01  7.954e+01  -0.777 0.437097    
## bio_10       -1.244e+03  2.052e+02  -6.061 1.43e-09 ***
## bio_11        2.754e+03  3.005e+02   9.165  < 2e-16 ***
## bio_12        1.155e+00  2.783e-01   4.150 3.38e-05 ***
## bio_13       -7.293e-02  9.213e-01  -0.079 0.936911    
## bio_14        5.699e+01  1.259e+01   4.526 6.14e-06 ***
## bio_15        2.089e+01  2.133e+00   9.796  < 2e-16 ***
## bio_16       -1.014e+00  7.661e-01  -1.323 0.185851    
## bio_17       -1.791e+01  3.588e+00  -4.991 6.19e-07 ***
## bio_18        1.459e-01  2.512e-01   0.581 0.561329    
## bio_19        4.667e+00  2.700e+00   1.729 0.083933 .  
## MAI_ARE      -2.628e-02  3.250e-02  -0.808 0.418874    
## MAI_YLD       1.432e-01  1.704e-02   8.406  < 2e-16 ***
## Longitude     9.447e+01  3.530e+01   2.676 0.007466 ** 
## Latitude      1.300e+01  3.535e+01   0.368 0.713134    
## rain.sum.lag -2.220e-01  1.938e-02 -11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 374.6 on 5718 degrees of freedom
## Multiple R-squared:  0.7403, Adjusted R-squared:  0.7388 
## F-statistic: 479.5 on 34 and 5718 DF,  p-value: < 2.2e-16

RandomForest and TPS

Random Forest price prediction

First check if there are any predictors with NA values

for(column in seq_along(mypts)){
  if(any(is.na(mypts[column]))){
    print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
  }
}

#There are no columns with missing values

Split data the to be used for Training and validation

Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.

# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
head(training_data)
##          Region   Market Month Year Latitude Longitude  Crop    pkg maize rice
## 1        Arusha   Arusha     5 2021 -3.36968  36.68808 Maize 469.00     1    0
## 2 Dar es Salaam   Temeke     5 2021 -6.85097  39.25672 Maize 485.00     1    0
## 3        Dodoma  Majengo     5 2021 -6.17971  35.74109 Maize 501.50     1    0
## 4        Dodoma Kibaigwa     5 2021 -6.08110  36.64645 Maize 375.00     1    0
## 5        Kagera   Bukoba     5 2021 -1.33140  31.81293 Maize 426.25     1    0
## 6       Manyara   Babati     5 2021 -4.21006  35.74915 Maize 375.00     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 2       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 3       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 4       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 5       0       0       0     0     0      0   0   0   0   0   1   0   0   0
## 6       0       0       0     0     0      0   0   0   0   0   1   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3     bio_4
## 1   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.60644
## 3   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4   0   0   0   0  83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5   0   0   0   0  17.1729083 2006.820 20.84789  8.869465 83.92802  38.70414
## 6   0   0   0   0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717  649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068  768.0339
##     bio_13      bio_14    bio_15   bio_16       bio_17   bio_18       bio_19
## 1 237.8160  7.36089814  92.10601 482.4729  24.95659498 276.6105  32.88688141
## 2 264.3072 28.32696315  76.22544 594.2531  90.46537640 268.0798 105.37909361
## 3 133.9309  0.01660535 115.12207 375.4677   0.08605806 284.0681   0.08605806
## 4 129.1152  0.21909479 100.22821 368.5410   2.45863517 320.1588   4.15077049
## 5 346.9308 43.30159629  58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006  0.91639794  89.44063 381.6907   6.74877687 325.6114  12.35086025
##        MAI_ARE      MAI_YLD   popdens latitude longitude rain.sum.lag
## 1 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808     541.4019
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672     809.3149
## 3 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109     637.1213
## 4 1.489781e+03 3.828877e+02  101.0565 -6.08110  36.64645     656.0751
## 5 6.446327e+02 1.834932e+03  280.9803 -1.33140  31.81293    1149.9932
## 6 5.496486e+02 1.917829e+03  219.1393 -4.21006  35.74915     576.1356
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
head(validation_data)
##          Region  Market Month Year Latitude Longitude  Crop      pkg maize rice
## 1 Dar es Salaam   Ilala     1 2024 -6.82941  39.26289 Maize 775.0000     1    0
## 2 Dar es Salaam  Temeke     1 2024 -6.85097  39.25672 Maize 850.0000     1    0
## 3   Kilimanjaro   Moshi     1 2024 -3.34865  37.34352 Maize 850.0000     1    0
## 4       Singida Singida     1 2024 -4.81261  34.74280 Maize 700.0000     1    0
## 5        Arusha  Arusha     1 2024 -3.36968  36.68808 Maize 709.0909     1    0
## 6        Dodoma Majengo     1 2024 -6.17971  35.74109 Maize 690.4545     1    0
##   sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 2       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 3       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 4       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 5       0       0       0     0     0      0   1   0   0   0   0   0   0   0
## 6       0       0       0     0     0      0   1   0   0   0   0   0   0   0
##   sep oct nov dec   ttcity_u5 ttport_1    bio_1     bio_2    bio_3    bio_4
## 1   0   0   0   0   0.4051143 1689.367 25.92678  8.978264 67.42460 152.9108
## 2   0   0   0   0   0.5137705 1692.300 25.93087  8.970697 67.33757 153.6064
## 3   0   0   0   0  11.3442758 1450.131 22.29721 11.005590 67.75448 180.7284
## 4   0   0   0   0 165.2321068 1606.655 20.38367 11.683626 71.57368 137.7943
## 5   0   0   0   0   6.5704844 1405.327 19.27741 10.934885 68.03365 169.4943
## 6   0   0   0   0  11.0485593 1752.489 22.37704 12.001499 69.67643 153.4637
##      bio_5    bio_6    bio_7    bio_8    bio_9   bio_10   bio_11    bio_12
## 1 32.19589 18.88015 13.31574 26.54073 24.05313 27.71579 24.02143 1122.8900
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 31.30237 15.06427 16.23809 22.88764 20.26380 24.25183 19.85315  938.0520
## 4 28.10475 11.78088 16.32386 21.29394 18.36286 21.69755 18.36286  662.1342
## 5 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375  882.1826
## 6 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572  582.4022
##     bio_13      bio_14    bio_15   bio_16      bio_17   bio_18       bio_19
## 1 258.0938 28.00268809  75.59542 582.2198 89.31864258 266.4441 103.38645312
## 2 264.3072 28.32696315  76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 277.1083 14.38640622  97.50828 541.7736 53.36366989 211.4703  72.58868003
## 4 140.9090  0.00000000 104.78967 387.3265  0.31403068 192.0734   0.31403068
## 5 237.8160  7.36089814  92.10601 482.4729 24.95659498 276.6105  32.88688141
## 6 133.9309  0.01660535 115.12207 375.4677  0.08605806 284.0681   0.08605806
##        MAI_ARE      MAI_YLD   popdens latitude longitude rain.sum.lag
## 1 3.490943e-03 1.426787e-02 8044.9216 -6.82941  39.26289    1085.9139
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097  39.25672    1088.0394
## 3 1.654091e+03 3.030923e+03  522.6697 -3.34865  37.34352     508.5944
## 4 1.631720e+01 1.052680e+03  254.1591 -4.81261  34.74280     613.3819
## 5 3.107393e+02 1.528850e+02  994.5706 -3.36968  36.68808     435.0957
## 6 1.283795e-04 9.139631e-03  352.5410 -6.17971  35.74109     560.4381

Random Forest for generating variable of importance

Tune The Forest

The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.

# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)

trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
              y=mypts_df$pkg) # Response variable
## mtry = 18  OOB error = 1330.723 
## Searching left ...
## mtry = 9     OOB error = 6914.108 
## -4.195754 0.05 
## Searching right ...
## mtry = 36    OOB error = 164.153 
## 0.8766437 0.05 
## mtry = 55    OOB error = 96.79918 
## 0.4103114 0.05

Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.

(mintree <- trf[which.min(trf[,2]),1])
## [1] 55

Fit The Random Forest Model (1)

Random Forest for generating variable of importance

Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.

# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                      Month + 
                      Year +
                      ttcity_u5 + ttport_1 + 
                      bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 +                        bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + 
                      MAI_ARE + MAI_YLD + 
                      Longitude + Latitude + 
                      rain.sum.lag,
                    data = training_data,mtry=mintree,importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttcity_u5 +      ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 +      bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE +      MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data,      mtry = mintree, importance = TRUE, na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 36
## 
##           Mean of squared residuals: 24367.66
##                     % Var explained: 95.33
varImpPlot(rf1)

## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 155.9889

This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.

importance_metrics <- importance(rf1, type=1)  # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
impvar
##  [1] "Year"         "Month"        "beans"        "rice"         "Longitude"   
##  [6] "sorghum"      "rain.sum.lag" "wheat"        "fmillet"      "bmillet"     
## [11] "bio_12"       "bio_18"       "MAI_YLD"      "Latitude"     "ttcity_u5"   
## [16] "bio_15"       "MAI_ARE"      "ttport_1"     "bio_3"        "bio_2"       
## [21] "bio_8"        "bio_1"        "bio_5"        "bio_4"        "bio_13"      
## [26] "bio_7"        "bio_16"       "bio_10"       "bio_11"       "bio_19"      
## [31] "bio_14"       "maize"        "bio_6"        "potato"       "bio_9"       
## [36] "bio_17"
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
##  [1] "Year"         "Month"        "beans"        "rice"         "Longitude"   
##  [6] "sorghum"      "rain.sum.lag" "wheat"        "fmillet"      "bmillet"     
## [11] "bio_12"       "bio_18"       "MAI_YLD"      "Latitude"     "ttcity_u5"   
## [16] "bio_15"       "MAI_ARE"      "ttport_1"     "bio_3"        "bio_2"
node_purity <- importance(rf1, type=2)  # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
node_purity_sorted
##         rice        beans         Year        wheat      fmillet        Month 
##    545262150    472837663    380711166    130780343    130524754    118049778 
##       potato       bio_12        maize rain.sum.lag    Longitude      sorghum 
##     48764484     46876442     43269385     41339371     40493227     38007061 
##      bmillet        bio_3        bio_7      MAI_YLD    ttcity_u5       bio_18 
##     37037032     26611939     25621683     23047045     20340459     17334860 
##     Latitude       bio_15      MAI_ARE     ttport_1        bio_6        bio_2 
##     13870649     13354723     12562599     11705550      9671336      8905350 
##        bio_5        bio_4        bio_9       bio_10       bio_19       bio_13 
##      7887209      7186178      6312415      5692600      5546136      5381328 
##        bio_8        bio_1       bio_11       bio_16       bio_14       bio_17 
##      5190873      5156708      4920759      4912567      4732035      3084859
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
##  [1] "rice"         "beans"        "Year"         "wheat"        "fmillet"     
##  [6] "Month"        "potato"       "bio_12"       "maize"        "rain.sum.lag"
## [11] "Longitude"    "sorghum"      "bmillet"      "bio_3"        "bio_7"       
## [16] "MAI_YLD"      "ttcity_u5"    "bio_18"       "Latitude"     "bio_15"
rf1$importanceSD
##        maize         rice      sorghum      bmillet      fmillet        wheat 
##    1882.1577    2770.9228     457.2999     630.7592    1639.2469    1556.6641 
##        beans       potato        Month         Year    ttcity_u5     ttport_1 
##    2955.5197    1928.6726     273.3616     488.2066     277.1534     218.3697 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##     120.2380     214.5229     584.6667     181.9721     168.0418     316.5164 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##     722.7443     115.8098     279.8230     155.6259     137.4150     828.6967 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##     134.2437     129.5546     289.8310     147.2297     137.7304     274.4271 
##       bio_19      MAI_ARE      MAI_YLD    Longitude     Latitude rain.sum.lag 
##     195.6188     231.7859     433.6816     322.8656     276.0473     168.4705

Fit The Random Forest Model (2)

Estimate more parsimonious specification

In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.

# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                     Month +
                     Year + 
                     ttport_1 +
                     bio_3 + bio_6  + bio_9 + bio_12 + 
                     rain.sum.lag, 
                   data=training_data, na.rm=TRUE)

rf
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttport_1 +      bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 29815.16
##                     % Var explained: 94.29
# evaluate
varImpPlot(rf)

(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 172.539
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")

spatial prediction

These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.

year <- 2023

Note: we must set the rain.sum.lag variables for each month we are predicting

Predicted Maize Prices

#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_maize <- data.frame(
    maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_maize <- lapply(1:12, predict_for_month)


# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100 

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100

# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0))  # Set margins to 0 for inner plots
for (i in 1:12) {
  plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
       zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)

# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9,
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
           legend.args = list(text = "Predicted Maize Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Beans Prices

# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"

  newstack <- c(rstack, rain_sum_lag)

  const_beans <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
    Month = month,
    Year = year
  )

  pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_beans <- lapply(1:12, predict_for_beans)

# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year), 
       zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Beans Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Rice Prices

# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_rice <- data.frame(
    maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_rice <- lapply(1:12, predict_for_rice)

# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year), 
       zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1)) 

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Rice Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Sorghum Prices

# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_sorghum <- data.frame(
    maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = 2023
  )
  
  pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_sorghum <- lapply(1:12, predict_for_sorghum)

# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 300 

# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year), 
       zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Sorghum Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Bulrush Millet Prices

# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_bmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_bmillet <- lapply(1:12, predict_for_bmillet)


# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot bmillet prices
break_interval <- 150 
par(mar = c(0, 0, 0, 0)) 
for (i in 1:12) {
  plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year), 
       zlim = c(min_bmillet, max_bmillet), col = color_palette, 
       breaks = seq(min_bmillet, max_bmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5) 

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Bulrush Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Finger Millet Prices

#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_fmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_fmillet <- lapply(1:12, predict_for_fmillet)

# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot fmillet prices
break_interval <- 300 
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year), 
       zlim = c(min_fmillet, max_fmillet), col = color_palette, 
       breaks = seq(min_fmillet, max_fmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)  

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Finger Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted Wheat Prices

#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_wheat <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_wheat <- lapply(1:12, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100

# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year), 
       zlim = c(min_wheat, max_wheat), col = color_palette, 
       breaks = seq(min_wheat, max_wheat, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Wheat Price (Tsh)", side = 1, line = 2, cex = 0.9))

Predicted potato prices

#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
  rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_potato <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_potato <- lapply(1:12, predict_for_potato)

# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
  plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year), 
       zlim = c(min_potato, max_potato), col = color_palette, 
       breaks = seq(min_potato, max_potato, by = break_interval), 
       legend = FALSE, axes = FALSE)
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5)  # Adjust n as needed

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Potato Price (Tsh)", side = 1, line = 2, cex = 0.9))

Prediction Evaluation

1. Using Validation data

pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
## [1] "Mean Squared Error: 136117.087211266"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error:  176.24848662951"
#Save predicted & observed yield
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X"         "actual"    "predicted"
#RMSE predicting from rf - predicited vs observed 
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
## [1] 368.94
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,3)
print(rf.r2)
## [1] 0.8
range(actual)
## [1]  350 4050
range(pred)
## [1]  631.9554 3268.8138
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh)",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  scale_y_continuous("Predicted Price (Tsh)",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
  annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

2. Compare the observed Prices (the training data) with the predicted Prices (predicted using the training data) using stats package

library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
  geom_point(colour = "blue", size = 1.4 ,alpha=0.6) + 
  ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
  geom_abline(slope = 1, alpha=0.3) +
  annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3)  +
  annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3)  +
  labs(x = "Observed Price (Tsh)", y = "Predicted Price (Tsh)") +
  xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot

Partial dependence plots

Plot Partial dependence of all the variables used except food commodities and months.

library(caret)

var_importance <- varImp(rf)

impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))

for (i in seq_along(predictors_to_plot)) {
  partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
              main="Partial Dependence")
}

>>>>>>> 1b7f8f15e34c512dd5e6755836bcce4db40a8006 >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e